/**
 * Copyright (c) 2021 OceanBase
 * OceanBase CE is licensed under Mulan PubL v2.
 * You can use this software according to the terms and conditions of the Mulan PubL v2.
 * You may obtain a copy of Mulan PubL v2 at:
 *          http://license.coscl.org.cn/MulanPubL-2.0
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PubL v2 for more details.
 */

#define USING_LOG_PREFIX LIB_CHARSET
#include "lib/charset/ob_charset.h"
#include "lib/utility/serialization.h"
#include "lib/ob_define.h"
#include "lib/worker.h"
#include "common/ob_common_utility.h"

namespace oceanbase
{
namespace common
{

// BEGIN displayed length {{{1
// ref: https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c

// Closed range [first, last] of code points. Arrays of these form the sorted
// lookup tables that bisearch() scans.
struct interval {
  int first;  // lowest code point of the range (inclusive)
  int last;   // highest code point of the range (inclusive)
};

/*
 * Binary search helper for the interval tables in this file.
 *
 * table is expected to be sorted ascending with non-overlapping entries, and
 * max is the index of its last entry (not the entry count).
 * Returns 1 when ucs lies inside one of the closed intervals, 0 otherwise.
 */
static int bisearch(ob_wc_t ucs, const struct interval *table, int max) {
  int lo = 0;
  int hi = max;
  int found = 0;

  /* only search when ucs is inside the span covered by the whole table */
  if (!(ucs < table[0].first || ucs > table[max].last)) {
    while (0 == found && hi >= lo) {
      const int probe = (lo + hi) / 2;
      if (ucs > table[probe].last) {
        lo = probe + 1;
      } else if (ucs < table[probe].first) {
        hi = probe - 1;
      } else {
        found = 1;
      }
    }
  }

  return found;
}


/* The following two functions define the column width of an ISO 10646
 * character as follows:
 *
 *    - The null character (U+0000) has a column width of 0.
 *
 *    - Other C0/C1 control characters and DEL will lead to a return
 *      value of -1.
 *
 *    - Non-spacing and enclosing combining characters (general
 *      category code Mn or Me in the Unicode database) have a
 *      column width of 0.
 *
 *    - SOFT HYPHEN (U+00AD) has a column width of 1.
 *
 *    - Other format characters (general category code Cf in the Unicode
 *      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 *
 *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
 *      have a column width of 0.
 *
 *    - Spacing characters in the East Asian Wide (W) or East Asian
 *      Full-width (F) category as defined in Unicode Technical
 *      Report #11 have a column width of 2.
 *
 *    - All remaining characters (including all printable
 *      ISO 8859-1 and WGL4 characters, Unicode control characters,
 *      etc.) have a column width of 1.
 *
 * This implementation assumes that wchar_t characters are encoded
 * in ISO 10646.
 */

/*
 * Column width of a single code point.
 *
 * Returns:
 *    0  for U+0000 and for every code point listed in the combining table,
 *   -1  for values below 32 and for 0x7f..0x9f,
 *    2  for code points in the wide ranges tested at the end,
 *    1  for everything else.
 */
int mk_wcwidth(ob_wc_t ucs)
{
  /* sorted list of non-overlapping intervals of non-spacing characters */
  /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
  /* The two entries 0x302A-0x302F and 0x3099-0x309A are commented out below,
   * so those code points are not reported as zero-width; they fall into the
   * 0x2e80..0xa4cf range test and yield width 2. */
  static const struct interval combining[] = {
    { 0x0300, 0x036F }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 },
    { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },
    { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0600, 0x0603 },
    { 0x0610, 0x0615 }, { 0x064B, 0x065E }, { 0x0670, 0x0670 },
    { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED },
    { 0x070F, 0x070F }, { 0x0711, 0x0711 }, { 0x0730, 0x074A },
    { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x0901, 0x0902 },
    { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D },
    { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 },
    { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD },
    { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C },
    { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D },
    { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC },
    { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD },
    { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C },
    { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D },
    { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 },
    { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 },
    { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBC, 0x0CBC },
    { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
    { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D },
    { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 },
    { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E },
    { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC },
    { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 },
    { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E },
    { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 },
    { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 },
    { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 },
    { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x135F, 0x135F },
    { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
    { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
    { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
    { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
    { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
    { 0x1A17, 0x1A18 }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 },
    { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 },
    { 0x1B6B, 0x1B73 }, { 0x1DC0, 0x1DCA }, { 0x1DFE, 0x1DFF },
    { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x2063 },
    { 0x206A, 0x206F }, { 0x20D0, 0x20EF }, /*{ 0x302A, 0x302F },*/
    /*{ 0x3099, 0x309A },*/ { 0xA806, 0xA806 }, { 0xA80B, 0xA80B },
    { 0xA825, 0xA826 }, { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F },
    { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB },
    { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
    { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x1D167, 0x1D169 },
    { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
    { 0x1D242, 0x1D244 }, { 0xE0001, 0xE0001 }, { 0xE0020, 0xE007F },
    { 0xE0100, 0xE01EF }
  };

  /* test for 8-bit control characters */
  if (ucs == 0)
    return 0;
  if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
    return -1;

  /* binary search in table of non-spacing characters */
  /* bisearch() takes the index of the last entry, hence the "- 1" */
  if (0 != bisearch(ucs, combining,
               sizeof(combining) / sizeof(struct interval) - 1))
    return 0;

  /* if we arrive here, ucs is not a combining or C0/C1 control character */

  /* the parenthesised condition evaluates to 0 or 1, giving width 1 or 2 */
  return 1 +
      (ucs >= 0x1100 &&
       (ucs <= 0x115f ||                    /* Hangul Jamo init. consonants */
        ucs == 0x2329 || ucs == 0x232a ||
        (ucs >= 0x2e80 && ucs <= 0xa4cf &&
         ucs != 0x303f) ||                  /* CJK ... Yi */
        (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
        (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */
        (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */
        (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
        (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */
        (ucs >= 0xffe0 && ucs <= 0xffe6) ||
        (ucs >= 0x20000 && ucs <= 0x2fffd) ||
        (ucs >= 0x30000 && ucs <= 0x3fffd)));
}

/*
 * Sum the column widths of at most n characters of pwcs, stopping early at a
 * terminating L'\0'.
 *
 * Returns -1 as soon as mk_wcwidth() reports a negative width for any
 * character, otherwise the accumulated width.
 */
int mk_wcswidth(const wchar_t *pwcs, size_t n)
{
  int width = 0;

  /* Test the remaining count BEFORE dereferencing, so pwcs[n] is never read
   * when the buffer holds exactly n characters and no terminator. */
  for (; n > 0 && *pwcs != 0; --n, ++pwcs) {
    const int w = mk_wcwidth(*pwcs);
    if (w < 0) {
      return -1;
    }
    width += w;
  }

  return width;
}


/*
 * The following functions are the same as mk_wcwidth() and
 * mk_wcswidth(), except that spacing characters in the East Asian
 * Ambiguous (A) category as defined in Unicode Technical Report #11
 * have a column width of 2. This variant might be useful for users of
 * CJK legacy encodings who want to migrate to UCS without changing
 * the traditional terminal character-width behaviour. It is not
 * otherwise recommended for general use.
 */
/*
 * CJK variant of mk_wcwidth(): code points found in the East Asian Ambiguous
 * table below are reported as width 2; every other code point is delegated
 * to mk_wcwidth() unchanged.
 */
int mk_wcwidth_cjk(wchar_t ucs)
{
  /* sorted list of non-overlapping intervals of East Asian Ambiguous
   * characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */
  static const struct interval ambiguous[] = {
    { 0x00A1, 0x00A1 }, { 0x00A4, 0x00A4 }, { 0x00A7, 0x00A8 },
    { 0x00AA, 0x00AA }, { 0x00AE, 0x00AE }, { 0x00B0, 0x00B4 },
    { 0x00B6, 0x00BA }, { 0x00BC, 0x00BF }, { 0x00C6, 0x00C6 },
    { 0x00D0, 0x00D0 }, { 0x00D7, 0x00D8 }, { 0x00DE, 0x00E1 },
    { 0x00E6, 0x00E6 }, { 0x00E8, 0x00EA }, { 0x00EC, 0x00ED },
    { 0x00F0, 0x00F0 }, { 0x00F2, 0x00F3 }, { 0x00F7, 0x00FA },
    { 0x00FC, 0x00FC }, { 0x00FE, 0x00FE }, { 0x0101, 0x0101 },
    { 0x0111, 0x0111 }, { 0x0113, 0x0113 }, { 0x011B, 0x011B },
    { 0x0126, 0x0127 }, { 0x012B, 0x012B }, { 0x0131, 0x0133 },
    { 0x0138, 0x0138 }, { 0x013F, 0x0142 }, { 0x0144, 0x0144 },
    { 0x0148, 0x014B }, { 0x014D, 0x014D }, { 0x0152, 0x0153 },
    { 0x0166, 0x0167 }, { 0x016B, 0x016B }, { 0x01CE, 0x01CE },
    { 0x01D0, 0x01D0 }, { 0x01D2, 0x01D2 }, { 0x01D4, 0x01D4 },
    { 0x01D6, 0x01D6 }, { 0x01D8, 0x01D8 }, { 0x01DA, 0x01DA },
    { 0x01DC, 0x01DC }, { 0x0251, 0x0251 }, { 0x0261, 0x0261 },
    { 0x02C4, 0x02C4 }, { 0x02C7, 0x02C7 }, { 0x02C9, 0x02CB },
    { 0x02CD, 0x02CD }, { 0x02D0, 0x02D0 }, { 0x02D8, 0x02DB },
    { 0x02DD, 0x02DD }, { 0x02DF, 0x02DF }, { 0x0391, 0x03A1 },
    { 0x03A3, 0x03A9 }, { 0x03B1, 0x03C1 }, { 0x03C3, 0x03C9 },
    { 0x0401, 0x0401 }, { 0x0410, 0x044F }, { 0x0451, 0x0451 },
    { 0x2010, 0x2010 }, { 0x2013, 0x2016 }, { 0x2018, 0x2019 },
    { 0x201C, 0x201D }, { 0x2020, 0x2022 }, { 0x2024, 0x2027 },
    { 0x2030, 0x2030 }, { 0x2032, 0x2033 }, { 0x2035, 0x2035 },
    { 0x203B, 0x203B }, { 0x203E, 0x203E }, { 0x2074, 0x2074 },
    { 0x207F, 0x207F }, { 0x2081, 0x2084 }, { 0x20AC, 0x20AC },
    { 0x2103, 0x2103 }, { 0x2105, 0x2105 }, { 0x2109, 0x2109 },
    { 0x2113, 0x2113 }, { 0x2116, 0x2116 }, { 0x2121, 0x2122 },
    { 0x2126, 0x2126 }, { 0x212B, 0x212B }, { 0x2153, 0x2154 },
    { 0x215B, 0x215E }, { 0x2160, 0x216B }, { 0x2170, 0x2179 },
    { 0x2190, 0x2199 }, { 0x21B8, 0x21B9 }, { 0x21D2, 0x21D2 },
    { 0x21D4, 0x21D4 }, { 0x21E7, 0x21E7 }, { 0x2200, 0x2200 },
    { 0x2202, 0x2203 }, { 0x2207, 0x2208 }, { 0x220B, 0x220B },
    { 0x220F, 0x220F }, { 0x2211, 0x2211 }, { 0x2215, 0x2215 },
    { 0x221A, 0x221A }, { 0x221D, 0x2220 }, { 0x2223, 0x2223 },
    { 0x2225, 0x2225 }, { 0x2227, 0x222C }, { 0x222E, 0x222E },
    { 0x2234, 0x2237 }, { 0x223C, 0x223D }, { 0x2248, 0x2248 },
    { 0x224C, 0x224C }, { 0x2252, 0x2252 }, { 0x2260, 0x2261 },
    { 0x2264, 0x2267 }, { 0x226A, 0x226B }, { 0x226E, 0x226F },
    { 0x2282, 0x2283 }, { 0x2286, 0x2287 }, { 0x2295, 0x2295 },
    { 0x2299, 0x2299 }, { 0x22A5, 0x22A5 }, { 0x22BF, 0x22BF },
    { 0x2312, 0x2312 }, { 0x2460, 0x24E9 }, { 0x24EB, 0x254B },
    { 0x2550, 0x2573 }, { 0x2580, 0x258F }, { 0x2592, 0x2595 },
    { 0x25A0, 0x25A1 }, { 0x25A3, 0x25A9 }, { 0x25B2, 0x25B3 },
    { 0x25B6, 0x25B7 }, { 0x25BC, 0x25BD }, { 0x25C0, 0x25C1 },
    { 0x25C6, 0x25C8 }, { 0x25CB, 0x25CB }, { 0x25CE, 0x25D1 },
    { 0x25E2, 0x25E5 }, { 0x25EF, 0x25EF }, { 0x2605, 0x2606 },
    { 0x2609, 0x2609 }, { 0x260E, 0x260F }, { 0x2614, 0x2615 },
    { 0x261C, 0x261C }, { 0x261E, 0x261E }, { 0x2640, 0x2640 },
    { 0x2642, 0x2642 }, { 0x2660, 0x2661 }, { 0x2663, 0x2665 },
    { 0x2667, 0x266A }, { 0x266C, 0x266D }, { 0x266F, 0x266F },
    { 0x273D, 0x273D }, { 0x2776, 0x277F }, { 0xE000, 0xF8FF },
    { 0xFFFD, 0xFFFD }, { 0xF0000, 0xFFFFD }, { 0x100000, 0x10FFFD }
  };

  /* binary search in table of East Asian Ambiguous characters;
   * bisearch() takes the index of the last entry, hence the "- 1" */
  if (bisearch(ucs, ambiguous,
         sizeof(ambiguous) / sizeof(struct interval) - 1))
    return 2;

  /* not ambiguous: use the standard width rules */
  return mk_wcwidth(ucs);
}


/*
 * CJK variant of the string width helper: sum mk_wcwidth_cjk() over at most
 * n characters of pwcs, stopping early at a terminating L'\0'.
 *
 * Returns -1 as soon as mk_wcwidth_cjk() reports a negative width for any
 * character, otherwise the accumulated width.
 */
int mk_wcswidth_cjk(const wchar_t *pwcs, size_t n)
{
  int width = 0;

  /* Test the remaining count BEFORE dereferencing, so pwcs[n] is never read
   * when the buffer holds exactly n characters and no terminator. */
  for (; n > 0 && *pwcs != 0; --n, ++pwcs) {
    const int w = mk_wcwidth_cjk(*pwcs);
    if (w < 0) {
      return -1;
    }
    width += w;
  }

  return width;
}

// END displayed length }}}

// Table of the charsets this build exposes, one row per charset.
// Each row appears to be {charset id, description, default collation,
// max bytes per character} -- field meaning inferred from the values; confirm
// against the ObCharsetWrapper declaration.
// NOTE(review): if the last field is the max byte length per character, UTF-16
// surrogate pairs need 4 bytes while the row below says 2 -- confirm intent.
const ObCharsetWrapper ObCharset::charset_wrap_arr_[ObCharset::VALID_CHARSET_TYPES] =
{
  {CHARSET_BINARY, "Binary pseudo charset", CS_TYPE_BINARY, 1},
  {CHARSET_UTF8MB4, "UTF-8 Unicode", CS_TYPE_UTF8MB4_GENERAL_CI, 4},
  {CHARSET_GBK, "GBK charset", CS_TYPE_GBK_CHINESE_CI, 2},
  {CHARSET_UTF16, "UTF-16 Unicode", CS_TYPE_UTF16_GENERAL_CI, 2},
  {CHARSET_GB18030, "GB18030 charset", CS_TYPE_GB18030_CHINESE_CI, 4},
  {CHARSET_LATIN1, "cp1252 West European", CS_TYPE_LATIN1_SWEDISH_CI, 1},
  {CHARSET_GB18030_2022, "GB18030-2022 charset", CS_TYPE_GB18030_2022_PINYIN_CI, 4},
};

// Table of the collations this build exposes, one row per collation.
// The first and third columns always carry the same collation id and the
// second column is the owning charset. The meaning of the remaining
// bool/bool/int columns is not visible here (presumably is-default,
// is-compiled and sort length) -- confirm against ObCollationWrapper.
// Exactly one row per charset has the first bool set to true.
const ObCollationWrapper ObCharset::collation_wrap_arr_[ObCharset::VALID_COLLATION_TYPES] =
{
  {CS_TYPE_UTF8MB4_GENERAL_CI, CHARSET_UTF8MB4, CS_TYPE_UTF8MB4_GENERAL_CI, true, true, 1},
  {CS_TYPE_UTF8MB4_BIN, CHARSET_UTF8MB4, CS_TYPE_UTF8MB4_BIN, false, true, 1},
  {CS_TYPE_BINARY, CHARSET_BINARY, CS_TYPE_BINARY, true, true, 1},
  {CS_TYPE_GBK_CHINESE_CI, CHARSET_GBK, CS_TYPE_GBK_CHINESE_CI, true, true, 1},
  {CS_TYPE_GBK_BIN, CHARSET_GBK, CS_TYPE_GBK_BIN, false, true, 1},
  {CS_TYPE_UTF16_GENERAL_CI, CHARSET_UTF16, CS_TYPE_UTF16_GENERAL_CI, true, true, 1},
  {CS_TYPE_UTF16_BIN, CHARSET_UTF16, CS_TYPE_UTF16_BIN, false, true, 1},
  // disabled entry, kept for reference:
  //{CS_TYPE_UTF8MB4_ZH_0900_AS_CS, CHARSET_UTF8MB4, CS_TYPE_UTF8MB4_ZH_0900_AS_CS, false, true, 0},
  {CS_TYPE_UTF8MB4_UNICODE_CI, CHARSET_UTF8MB4, CS_TYPE_UTF8MB4_UNICODE_CI, false, true, 1},
  {CS_TYPE_UTF16_UNICODE_CI, CHARSET_UTF16, CS_TYPE_UTF16_UNICODE_CI, false, true, 1},
  {CS_TYPE_GB18030_CHINESE_CI, CHARSET_GB18030, CS_TYPE_GB18030_CHINESE_CI, true, true, 1},
  {CS_TYPE_GB18030_BIN, CHARSET_GB18030, CS_TYPE_GB18030_BIN, false, true, 1},
  {CS_TYPE_LATIN1_SWEDISH_CI, CHARSET_LATIN1, CS_TYPE_LATIN1_SWEDISH_CI,true, true, 1},
  {CS_TYPE_LATIN1_BIN, CHARSET_LATIN1, CS_TYPE_LATIN1_BIN,false, true, 1},
  {CS_TYPE_GB18030_2022_BIN, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_BIN, false, true, 1},
  {CS_TYPE_GB18030_2022_PINYIN_CI, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_PINYIN_CI, true, true, 1},
  {CS_TYPE_GB18030_2022_PINYIN_CS, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_PINYIN_CS, false, true, 1},
  {CS_TYPE_GB18030_2022_RADICAL_CI, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_RADICAL_CI, false, true, 1},
  {CS_TYPE_GB18030_2022_RADICAL_CS, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_RADICAL_CS, false, true, 1},
  {CS_TYPE_GB18030_2022_STROKE_CI, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_STROKE_CI, false, true, 1},
  {CS_TYPE_GB18030_2022_STROKE_CS, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_STROKE_CS, false, true, 1},
};

// Lookup table from collation id to its ObCharsetInfo handler. The methods in
// this file index it directly with an ObCollationType value; slots without a
// handler are NULL.
// The trailing number on each line is the array index of the first element
// written on that line.
ObCharsetInfo *ObCharset::charset_arr[CS_TYPE_MAX] = {
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 0 ~ 7
  &ob_charset_latin1, NULL, NULL, NULL, NULL, NULL, NULL, NULL,   // 8
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 16
  NULL, NULL, NULL, NULL, &ob_charset_gbk_chinese_ci,             // 24
                                NULL, NULL, NULL,                 // 29
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 32
  NULL, NULL, NULL, NULL, NULL,                                   // 40
                                &ob_charset_utf8mb4_general_ci,   // 45
                                      &ob_charset_utf8mb4_bin,    // 46
                                      &ob_charset_latin1_bin,     // 47
  NULL, NULL, NULL, NULL, NULL, NULL,                             // 48
                                     &ob_charset_utf16_general_ci,// 54
                                     &ob_charset_utf16_bin,       // 55
  NULL, NULL, NULL, NULL, NULL, NULL, NULL,                       // 56
                                            &ob_charset_bin,      // 63
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 64
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 72
  NULL, NULL, NULL, NULL, NULL, NULL, NULL,                       // 80
                                           &ob_charset_gbk_bin,   // 87
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 88
  NULL, NULL, NULL, NULL, NULL,                                   // 96
                                &ob_charset_utf16_unicode_ci,     // 101
                                      NULL, NULL,                 // 102
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 104
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 112
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 120
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 128
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 136
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 144
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 152
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 160
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 168
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 176
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 184
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 192
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 200
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 208
  &ob_charset_gb18030_2022_bin,        &ob_charset_gb18030_2022_pinyin_ci, // 216
  &ob_charset_gb18030_2022_pinyin_cs,  &ob_charset_gb18030_2022_radical_ci,// 218
  &ob_charset_gb18030_2022_radical_cs, &ob_charset_gb18030_2022_stroke_ci, // 220
  &ob_charset_gb18030_2022_stroke_cs, NULL,                       // 222
  &ob_charset_utf8mb4_unicode_ci,                                 // 224
        NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 225
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 232
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 240
  &ob_charset_gb18030_chinese_ci,                                 // 248
  &ob_charset_gb18030_bin,                                        // 249
              NULL, &ob_charset_gb18030_chinese_cs,    		  // 250
  NULL, NULL, NULL, NULL,                 			  // 252
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 256
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 264
  NULL                                                            // 272
};

double ObCharset::strntodv2(const char *str,
                          size_t str_len,
                          char **endptr,
                          int *err)
{
  double result = 0.0;
  if (lib::is_oracle_mode()) {
    ObString str_orig(str_len, str);
    ObString str_trim = str_orig.trim();
    if ((str_trim.case_compare("NAN") == 0)
            || (str_trim.case_compare("-NAN") == 0)
            || (str_trim.case_compare("+NAN") == 0)) {
      result = NAN;
      *endptr = str_trim.ptr() + str_trim.length();
    } else if ((str_trim.case_compare("+INFINITY") == 0)
           || (str_trim.case_compare("INFINITY") == 0)
           || (str_trim.case_compare("INF") == 0)) {
      result = INFINITY;
      *endptr = str_trim.ptr() + str_trim.length();
    } else if ((str_trim.case_compare("-INFINITY") == 0)
            || (str_trim.case_compare("-INF") == 0)) {
      result = -INFINITY;
      *endptr = str_trim.ptr() + str_trim.length();
    } else {
      result = strntod(str, str_len, endptr, err);
    }
  } else {
    result = strntod(str, str_len, endptr, err);
  }

  return result;
}

double ObCharset::strntod(const char *str,
                          size_t str_len,
                          char **endptr,
                          int *err)
{
  ObCharsetInfo *cs = &ob_charset_bin;
  double result = 0.0;
  if (is_argument_valid(cs, str, str_len)) {
    result = cs->cset->strntod(cs, const_cast<char *>(str), str_len, endptr, err);
  }
  return result;
}

int64_t ObCharset::strntoll(const char *str,
                            size_t str_len,
                            int base,
                            char **end_ptr,
                            int *err)
{
  ObCharsetInfo *cs = &ob_charset_bin;
  *end_ptr = const_cast<char*>(str);
  int64_t result = 0;
  if (is_argument_valid(cs, str, str_len)) {
    result = cs->cset->strntoll(cs, str, str_len, base, end_ptr, err);
  }
  return result;
}

uint64_t ObCharset::strntoull(const char *str,
                              size_t str_len,
                              int base,
                              char **end_ptr,
                              int *err)
{
  ObCharsetInfo *cs = &ob_charset_bin;
  *end_ptr = const_cast<char*>(str);
  uint64_t result = 0;
  if (is_argument_valid(cs, str, str_len)) {
    result = cs->cset->strntoull(cs,
                             str,
                             str_len,
                             base,
                             end_ptr,
                             err);
  }
  return result;
}
int64_t ObCharset::strntoll(const char *str,
                            size_t str_len,
                            int base,
                            int *err)
{
  ObCharsetInfo *cs = &ob_charset_bin;
  char *end_ptr = NULL;
  int64_t result = 0;
  if (is_argument_valid(cs, str, str_len)) {
    result = cs->cset->strntoll(cs, str, str_len, base, &end_ptr, err);
  }
  return result;
}

uint64_t ObCharset::strntoull(const char *str,
                              size_t str_len,
                              int base,
                              int *err)
{
  ObCharsetInfo *cs = &ob_charset_bin;
  char *end_ptr = NULL;
  uint64_t result = 0;
  if (is_argument_valid(cs, str, str_len)) {
    result = cs->cset->strntoull(cs,
                             str,
                             str_len,
                             base,
                             &end_ptr,
                             err);
  }
  return result;
}
uint64_t ObCharset::strntoullrnd(const char *str,
                                 size_t str_len,
                                 int unsigned_fl,
                                 char **endptr,
                                 int *err)
{
  ObCharsetInfo *cs = &ob_charset_bin;
  uint64_t result = 0;
  if (is_argument_valid(cs, str, str_len)) {
    result = cs->cset->strntoull10rnd(cs,
                                  str,
                                  str_len,
                                  unsigned_fl,
                                  endptr,
                                  err);
  }
  return result;
}

char* ObCharset::lltostr(int64_t val, char *dst, int radix, int upcase)
{
  int ret = OB_SUCCESS;
  static const int64_t MAX_BUFFER_SIZE = 65;//ok for int64min
  static char DIG_VEC_UPPER[] =
    "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  static char DIG_VEC_LOWER[] =
    "0123456789abcdefghijklmnopqrstuvwxyz";
  //we do not take '\0' into consideration. '\0' terminated string is not expected
  //use dst(start) and pret(end) to locate string, please.
  char buffer[MAX_BUFFER_SIZE];
  char *p = NULL;
  long int new_val = 0;
  char *dig_vec= upcase ? DIG_VEC_UPPER : DIG_VEC_LOWER;
  uint64_t uval= (uint64_t) val;
  char *pret = NULL;
  if (radix < 0) {
    if (radix < -36 || radix > -2) {
      ret = OB_INVALID_ARGUMENT;
      LOG_WARN("invalid radix", K(ret), K(radix));
    } else {
      if (val < 0) {
        *dst++ = '-';
        uval = (uint64_t)0 - uval;
      }
      radix = -radix;
    }
  } else if (radix > 36 || radix < 2) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid radix", K(ret), K(radix));
  }

  /*
    The slightly contorted code which follows is due to the fact that
    few machines directly support unsigned long / and %.  Certainly
    the VAX C compiler generates a subroutine call.  In the interests
    of efficiency (hollow laugh) I let this happen for the first digit
    only; after that "val" will be in range so that signed integer
    division will do.  Sorry 'bout that.  CHECK THE CODE PRODUCED BY
    YOUR C COMPILER.  The first % and / should be unsigned, the second
    % and / signed, but C compilers tend to be extraordinarily
    sensitive to minor details of style.  This works on a VAX, that's
    all I claim for it.
  */
  if (OB_SUCC(ret)) {
    p = &buffer[sizeof(buffer)-1];
    *p = '\0';
    new_val= uval / (uint64_t) radix;
    *--p = dig_vec[(unsigned char) (uval- (uint64_t) new_val*(uint64_t) radix)];
    val = new_val;
    ldiv_t res;
    while (val != 0)
    {
      res=ldiv(val,radix);
      *--p = dig_vec[res.rem];
      val= res.quot;
    }
    while ((*dst++ = *p++) != 0) ;
    pret = dst - 1;
  }
  return pret;
}

// Forward to the binary charset handler's scan() over [str, end) with the
// sequence type sq. Returns 0 and records an error backtrace when str or end
// is NULL.
size_t ObCharset::scan_str(const char *str,
                           const char *end,
                           int sq)
{
  ObCharsetInfo *cs = &ob_charset_bin;
  size_t scanned = 0;
  if (!OB_ISNULL(str) && !OB_ISNULL(end) && !OB_ISNULL(cs)) {
    scanned = cs->cset->scan(cs, str, end, sq);
  } else {
    BACKTRACE_RET(ERROR, OB_INVALID_ARGUMENT, true, "invalid argument. str = %p, end = %p, cs = %p", str, end, cs);
  }
  return scanned;
}
// Search str2 inside str1 with the collation's instr() handler.
// Returns match[0].mb_len + 1 when the handler reports a match (presumably
// the 1-based character position of the match -- confirm against the handler
// contract), and 0 when nothing matches or the arguments are invalid.
uint32_t ObCharset::instr(ObCollationType collation_type,
                          const char *str1,
                          int64_t str1_len,
                          const char *str2,
                          int64_t str2_len)
{
  uint32_t position = 0;
  if (is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    ObCharsetInfo *cs = charset_arr[collation_type];
    ob_match_t matches[2];
    unsigned int wanted = 1;  // only the first match record is requested
    const unsigned int found = cs->coll->instr(cs, str1, str1_len, str2, str2_len, matches, wanted);
    if (0 != found) {
      position = matches[0].mb_len + 1;
    }
  }
  return position;
}

// Variant of instr() that reports match[0].end - match[0].beg as filled in by
// the collation's instr() handler (presumably the byte offset of the match
// inside str1 -- confirm against the handler contract).
// Returns -1 when nothing matches or the arguments are invalid.
int64_t ObCharset::instrb(ObCollationType collation_type,
                          const char *str1,
                          int64_t str1_len,
                          const char *str2,
                          int64_t str2_len)
{
  int64_t offset = -1;
  if (is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    ObCharsetInfo *cs = charset_arr[collation_type];
    ob_match_t matches[2];
    unsigned int wanted = 1;  // only the first match record is requested
    if (0 != cs->coll->instr(cs, str1, str1_len, str2, str2_len, matches, wanted)) {
      offset = matches[0].end - matches[0].beg;
    }
  }
  return offset;
}

// Search str2 inside str1 starting at the 1-based character position pos.
//
// Returns 0 when the arguments are invalid, pos is outside [1, str1_len + 1],
// charpos() fails, str2 cannot fit in the remaining bytes, or no match is
// found. For an empty str2 the result is (byte offset of pos) + 1. Otherwise
// it is match.mb_len + (pos - 1) + 1 as reported by the collation's instr().
uint32_t ObCharset::locate(ObCollationType collation_type,
                const char *str1,
                int64_t str1_len,
                const char *str2,
                int64_t str2_len,
                int64_t pos)
{
  uint32_t result = 0;
  if (is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    ObCharsetInfo *cs = charset_arr[collation_type];
    const int64_t char_start = pos - 1;  // 0-based character offset
    if (char_start >= 0 && char_start <= str1_len) {
      int ret = OB_SUCCESS;
      // translate the character offset into a byte offset within str1
      const int64_t byte_start =
          static_cast<int64_t>(charpos(collation_type, str1, str1_len, char_start, &ret));
      if (OB_FAIL(ret) || byte_start + str2_len > str1_len) {
        // conversion failed or str2 cannot fit: result stays 0
      } else if (0 == str2_len) {
        result = static_cast<uint32_t>(byte_start) + 1;
      } else {
        ob_match_t match;
        uint32_t wanted = 1;
        const uint32_t found = cs->coll->instr(cs, str1 + byte_start, str1_len - byte_start,
                                               str2, str2_len, &match, wanted);
        if (0 != found) {
          result = match.mb_len + static_cast<uint32_t>(char_start) + 1;
        }
      }
    }
  }
  return result;
}

// Compare two byte strings with the collation's strnncoll() handler (prefix
// matching disabled). Returns the handler's result, or 0 when the arguments
// are invalid.
int ObCharset::strcmp(ObCollationType collation_type,
                      const char *str1,
                      int64_t str1_len,
                      const char *str2,
                      int64_t str2_len)
{
  int cmp = 0;
  if (is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    ObCharsetInfo *cs = charset_arr[collation_type];
    const unsigned char *lhs = reinterpret_cast<const unsigned char *>(str1);
    const unsigned char *rhs = reinterpret_cast<const unsigned char *>(str2);
    const bool as_prefix = false;
    cmp = cs->coll->strnncoll(cs, lhs, str1_len, rhs, str2_len, as_prefix);
  }
  return cmp;
}

// Compare two byte strings with the collation's strnncollsp() handler,
// forwarding cmp_endspace unchanged. Returns the handler's result, or 0 when
// the arguments are invalid.
int ObCharset::strcmpsp(ObCollationType collation_type,
                        const char *str1,
                        int64_t str1_len,
                        const char *str2,
                        int64_t str2_len,
                        bool cmp_endspace)
{
  int cmp = 0;
  if (is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    ObCharsetInfo *cs = charset_arr[collation_type];
    const unsigned char *lhs = reinterpret_cast<const unsigned char *>(str1);
    const unsigned char *rhs = reinterpret_cast<const unsigned char *>(str2);
    cmp = cs->coll->strnncollsp(cs, lhs, str1_len, rhs, str2_len, cmp_endspace);
  }
  return cmp;
}

size_t ObCharset::casedn(const ObCollationType collation_type, char *src, size_t src_len,
              char *dest, size_t dest_len)
{
  size_t size = 0;
  if (is_argument_valid(collation_type, src, src_len, dest, dest_len)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    size = cs->cset->casedn(cs, src, src_len, dest, dest_len);
  }
  return size;
}

size_t ObCharset::caseup(const ObCollationType collation_type, char *src, size_t src_len,
                         char *dest, size_t dest_len)
{
  size_t size = 0;
  if (is_argument_valid(collation_type, src, src_len, dest, dest_len)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    size = cs->cset->caseup(cs, src, src_len, dest, dest_len);
  }
  return size;
}

/**
 * @brief allocate new buf and do caseup
 */
int ObCharset::caseup(const ObCollationType collation_type,
                      const ObString &src,
                      ObString &dst,
                      ObIAllocator &allocator)
{
  int ret = OB_SUCCESS;

  if (OB_UNLIKELY(!is_valid_collation(collation_type))) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", K(ret), K(collation_type));
  } else if (src.empty()) {
    dst.reset();
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    size_t buf_len = src.length() * cs->caseup_multiply;
    char *buf = NULL;
    if (OB_ISNULL(buf = static_cast<char *>(allocator.alloc(buf_len)))) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("fail to allocate memory", K(ret));

    } else if (charset_type_by_coll(collation_type) == CHARSET_GB18030 ||
               charset_type_by_coll(collation_type) == CHARSET_GB18030_2022) {
      size_t dst_len = caseup(collation_type, (char*)src.ptr(), src.length(), buf, buf_len);
      dst.assign_ptr(buf, static_cast<int32_t>(dst_len));
    } else {
      if (OB_FAIL(ob_write_string(allocator, src, dst))) {
        LOG_WARN("fail to write string", K(ret));
      } else {
        ObCollationType col_type = (charset_type_by_coll(collation_type) == CHARSET_BINARY) ?
                                      ObCollationType::CS_TYPE_UTF8MB4_BIN : collation_type;
        size_t dst_len = caseup(col_type, dst.ptr(), dst.length(), dst.ptr(), dst.length());
        dst.set_length(static_cast<int32_t>(dst_len));
      }
    }
  }
  return ret;
}

/**
 * @brief Lower-case src into memory taken from allocator and expose the result via dst.
 *
 * - invalid collation: OB_INVALID_ARGUMENT.
 * - empty src: dst is reset, OB_SUCCESS.
 * - GB18030 / GB18030_2022: src is converted into a separate buffer of
 *   src.length() * casedn_multiply bytes.
 * - any other charset: src is copied with ob_write_string() and the copy is converted
 *   in place; a binary collation is converted with CS_TYPE_UTF8MB4_BIN.
 *
 * @return OB_SUCCESS, OB_INVALID_ARGUMENT, OB_ALLOCATE_MEMORY_FAILED, or the error
 *         reported by ob_write_string().
 */
int ObCharset::casedn(const ObCollationType collation_type,
                      const ObString &src,
                      ObString &dst,
                      ObIAllocator &allocator)
{
  int ret = OB_SUCCESS;

  if (OB_UNLIKELY(!is_valid_collation(collation_type))) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", K(ret), K(collation_type));
  } else if (src.empty()) {
    dst.reset();
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    // Size the output with the lower-casing expansion factor; the upper-casing
    // factor (caseup_multiply) belongs to caseup() only.
    size_t buf_len = src.length() * cs->casedn_multiply;
    char *buf = NULL;
    if (OB_ISNULL(buf = static_cast<char *>(allocator.alloc(buf_len)))) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("fail to allocate memory", K(ret));

    } else if (charset_type_by_coll(collation_type) == CHARSET_GB18030 ||
               charset_type_by_coll(collation_type) == CHARSET_GB18030_2022) {
      // Convert out of place, from src into the freshly allocated buffer.
      size_t dst_len = casedn(collation_type, (char*)src.ptr(), src.length(), buf, buf_len);
      dst.assign_ptr(buf, static_cast<int32_t>(dst_len));
    } else {
      if (OB_FAIL(ob_write_string(allocator, src, dst))) {
        LOG_WARN("fail to write string", K(ret));
      } else {
        // Convert the private copy in place; binary data uses utf8mb4_bin rules.
        ObCollationType col_type = (charset_type_by_coll(collation_type) == CHARSET_BINARY) ?
                                      ObCollationType::CS_TYPE_UTF8MB4_BIN : collation_type;
        size_t dst_len = casedn(col_type, dst.ptr(), dst.length(), dst.ptr(), dst.length());
        dst.set_length(static_cast<int32_t>(dst_len));
      }
    }
  }
  return ret;
}

#define OB_MAX_WEIGHT  OB_MAX_VARCHAR_LENGTH
/**
 * @brief Transform str into a sort key written to key, using the collation's strnxfrm().
 *
 * @param is_valid_unicode  receives the validity flag reported by strnxfrm(); it is
 *                          left untouched when the arguments are rejected.
 * @return the value returned by strnxfrm(), or 0 when is_argument_valid() fails.
 */
size_t ObCharset::sortkey(ObCollationType collation_type,
                          const char *str,
                          int64_t str_len,
                          char *key,
                          int64_t key_len,
                          bool &is_valid_unicode)
{
  size_t result = 0;
  if (is_argument_valid(collation_type, str, str_len, key, key_len)) {
    ObCharsetInfo *charset = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);

    // compare_collation_free already strips trailing spaces by itself, so sortkey
    // no longer needs its own space-filtering logic.
    //
    // Purpose of is_valid_unicode, by example. Strings to compare:
    //
    //   first : 0x2c 0x80
    //   second: 0x2c 0x80 0x20
    //
    // Compared without sort keys, 0x80 and everything after it is treated as an
    // illegal unicode sequence and compared as raw bytes, so the second string is
    // considered larger.
    //
    // Compared through sort keys, the transformation stops at the illegal 0x80,
    // so the two strings would compare equal.
    //
    // Fix: unicode strings containing illegal characters are compared the native
    // way, without sort-key transformation.
    bool valid_flag = false;
    unsigned char *key_bytes = reinterpret_cast<unsigned char *>(key);
    const unsigned char *str_bytes = reinterpret_cast<const unsigned char *>(str);
    result = charset->coll->strnxfrm(charset, key_bytes, key_len, OB_MAX_WEIGHT,
                                     str_bytes, str_len, 0, &valid_flag);
    is_valid_unicode = valid_flag;
  }
  return result;
}

/**
 * @brief Variable-length sort-key transformation via the collation's strnxfrm_varlen().
 *
 * @param is_valid_unicode  receives the validity flag reported by the handler; it is
 *                          not written when the arguments are rejected or when the
 *                          collation has no strnxfrm_varlen.
 * @return 0 when is_argument_valid() fails, (size_t)-1 when the collation provides
 *         no strnxfrm_varlen, otherwise the handler's return value.
 */
size_t ObCharset::sortkey_var_len(ObCollationType collation_type,
                          const char *str,
                          int64_t str_len,
                          char *key,
                          int64_t key_len,
                          bool is_space_cmp,
                          bool &is_valid_unicode)
{
  size_t result = 0;
  if (is_argument_valid(collation_type, str, str_len, key, key_len)) {
    ObCharsetInfo *charset = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);

    // Unicode strings containing illegal characters are compared the native way,
    // without sort-key transformation.
    if (NULL != charset->coll->strnxfrm_varlen) {
      bool valid_flag = false;
      unsigned char *key_bytes = reinterpret_cast<unsigned char *>(key);
      const unsigned char *str_bytes = reinterpret_cast<const unsigned char *>(str);
      result = charset->coll->strnxfrm_varlen(charset, key_bytes, key_len, OB_MAX_WEIGHT,
                                              str_bytes, str_len, is_space_cmp, &valid_flag);
      is_valid_unicode = valid_flag;
    } else {
      result = -1;
    }
  }
  return result;
}

/**
 * @brief Hash str with the collation's hash_sort(), starting from seed.
 *
 * @return seed unchanged when the arguments are rejected or the collation handler is
 *         missing; otherwise the accumulator as updated by hash_sort().
 */
uint64_t ObCharset::hash(ObCollationType collation_type,
                         const char *str,
                         int64_t str_len,
                         uint64_t seed,
                         const bool calc_end_space,
                         hash_algo hash_algo)
{
  uint64_t ret = seed;
  if (is_argument_valid(collation_type, str, str_len, NULL, 0)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->coll)) {
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->coll), K(lbt()));
    } else {
      // hash_sort() of the collation handler takes two integers: one that is both
      // input and result (ret, initialised from the caller's seed) and one that is
      // input only. The second one is fixed to 0xc6a4a7935bd1e995, a constant
      // borrowed from murmurhash64A().
      uint64_t mix_constant = 0xc6a4a7935bd1e995;
      const unsigned char *bytes = reinterpret_cast<const unsigned char *>(str);
      cs->coll->hash_sort(cs, bytes, str_len, &ret, &mix_constant, calc_end_space, hash_algo);
    }
  }
  return ret;
}

/* Convenience overload that takes calc_end_space from lib::is_oracle_mode().
 * Only called by unit tests for now, where is_oracle_mode() always returns false.
 * If you want to use this hash function elsewhere, please contact @maoli first. */
uint64_t ObCharset::hash(ObCollationType collation_type,
                         const char *str,
                         int64_t str_len,
                         uint64_t seed,
                         hash_algo hash_algo) {
  const bool calc_end_space = lib::is_oracle_mode();
  return hash(collation_type, str, str_len, seed, calc_end_space, hash_algo);
}

/**
 * @brief Compute the [min_str, max_str] range for a LIKE pattern via the collation's
 *        like_range(), using '_' and '%' as the single/multi wildcards.
 *
 * @param min_str_len / max_str_len  in: capacities of the two buffers (the smaller one
 *        is passed to the handler as the result size); also passed to the handler as
 *        its length out-parameters.
 * @return OB_ERR_UNEXPECTED for an out-of-range collation, a null pointer argument or a
 *         missing collation handler; OB_NOT_SUPPORTED when the collation has no charset
 *         entry; OB_EMPTY_RANGE when the handler returns non-zero; OB_SUCCESS otherwise.
 */
int ObCharset::like_range(ObCollationType collation_type,
                          const ObString &like_str,
                          char escape,
                          char *min_str,
                          size_t *min_str_len,
                          char *max_str,
                          size_t *max_str_len)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  OB_ISNULL(min_str) ||
                  OB_ISNULL(min_str_len) ||
                  OB_ISNULL(max_str) ||
                  OB_ISNULL(max_str_len)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret),
              K(collation_type),
              KP(max_str), K(max_str_len),
              KP(min_str), K(min_str_len));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->coll)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->coll));
    } else {
      // History (kept from an earlier, now disabled, change):
      // like_range pads min_str, e.g. 'abc%' yields ('abc\min\min..', 'abc\max\max..').
      // The storage layer sometimes compares byte-wise and would then decide that
      // 'abc' lies outside the range. The attempted fix was to scan like_str up to the
      // first '%' or '_' and shrink *min_str_len to that prefix length, giving
      // ('abc', 'abc\max\max\max..').
      // That change broke another case: 'a\0' fell outside the range, because with
      // mysql's utf8 behaviour 'a\0' < 'a'. So the range must not be trimmed here;
      // the actual correction is left to the storage layer.
      const char wildcard_one = '_';
      const char wildcard_many = '%';
      size_t res_size = *max_str_len;
      if (*min_str_len < *max_str_len) {
        res_size = *min_str_len;
      }
      const int rc = cs->coll->like_range(cs,
                                          like_str.ptr(),
                                          like_str.length(),
                                          escape,
                                          wildcard_one,
                                          wildcard_many,
                                          res_size,
                                          min_str,
                                          max_str,
                                          min_str_len,
                                          max_str_len);
      if (0 != rc) {
        ret = OB_EMPTY_RANGE;
      }
    }
  }

  return ret;
}

size_t ObCharset::strlen_char(const ObCollationType collation_type,
                              const char *str,
                              int64_t str_len)
{
  size_t ret = 0;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)", K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset), K(lbt()));
    } else {
      ret = cs->cset->numchars(cs, str, str + str_len);
    }
  }
  return ret;
}

size_t ObCharset::strlen_byte_no_sp(const ObCollationType collation_type,
                                    const char *str,
                                    int64_t str_len)
{
  size_t ret = 0;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)", K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset), K(lbt()));
    } else {
      ret = cs->cset->lengthsp(cs, str, str_len);
    }
  }
  return ret;
}

/**
 * @brief Strict variant: compute the well-formed length of str and fail if the charset
 *        handler reports an error.
 *
 * @param well_formed_len  out: set to 0 when str_len <= 0, otherwise to the handler's
 *                         result; not written on the argument-validation error paths.
 * @return OB_ERR_UNEXPECTED (collation out of range or no cset handler),
 *         OB_NOT_SUPPORTED (no charset entry), OB_INVALID_ARGUMENT (NULL str with
 *         non-zero length), OB_ERR_INCORRECT_STRING_VALUE (handler reported an error),
 *         or OB_SUCCESS.
 */
int ObCharset::well_formed_len(ObCollationType collation_type, const char *str,
                           int64_t str_len, int64_t &well_formed_len)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)", K(collation_type), K(lbt()));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else if (OB_UNLIKELY(NULL == str && 0 != str_len)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument, str is null  and  str_len is nonzero",
             KP(str), K(str_len), K(ret));
  } else if (str_len <= 0) {
    // Empty (or negative-length) input: nothing to scan.
    well_formed_len = 0;
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      int32_t error = 0;
      const char *str_end = str + str_len;
      well_formed_len = cs->cset->well_formed_len(cs, str, str_end, UINT64_MAX, &error);
      if (0 != error) {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
        LOG_WARN("well_formed_len failed. invalid char found",
                 K(ret), K(error), "str", ObString(str_len, str), KPHEX(str, str_len));
      }
    } else {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset), K(lbt()));
    }
  }
  return ret;
}


/**
 * @brief Lenient variant: compute the well-formed length of str and hand the handler's
 *        error indicator back through well_formed_error instead of failing on it.
 *
 * @param well_formed_len    out: the handler's result (not written on error returns).
 * @param well_formed_error  out: passed straight to the charset handler.
 * @return OB_ERR_UNEXPECTED (collation out of range or no cset handler),
 *         OB_NOT_SUPPORTED (no charset entry), OB_INVALID_ARGUMENT (NULL str with
 *         non-zero length), or OB_SUCCESS.
 */
int ObCharset::well_formed_len(ObCollationType collation_type, const char *str,
                           int64_t str_len, int64_t &well_formed_len, int32_t &well_formed_error)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)", K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else if (OB_UNLIKELY(NULL == str && 0 != str_len)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument, str is null  and  str_len is nonzero",
             KP(str), K(str_len), K(ret));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      const char *str_end = str + str_len;
      well_formed_len = cs->cset->well_formed_len(cs, str, str_end, UINT64_MAX,
                                                  &well_formed_error);
    } else {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    }
  }
  return ret;
}

/**
 * @brief Ask the charset's charpos() for the position of `length` within
 *        [str, str + str_len), clamped to str_len.
 *
 * @param ret  optional; set to OB_ERROR_OUT_OF_RANGE when the handler's answer exceeded
 *             str_len and had to be clamped. Not written otherwise.
 * @return 0 (after logging a warning) for an unusable collation, otherwise the
 *         (possibly clamped) position.
 */
size_t ObCharset::charpos(const ObCollationType collation_type,
                              const char *str,
                              const int64_t str_len,
                              const int64_t length,
                              int *ret)
{
  size_t res_pos = 0;
  const bool type_in_range = (collation_type > CS_TYPE_INVALID && collation_type < CS_TYPE_MAX);
  if (OB_UNLIKELY(!type_in_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)", K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      const size_t raw_pos = cs->cset->charpos(cs, str, str + str_len, length);
      if (raw_pos > str_len) {
        // The handler pointed past the end of the input: clamp and report it.
        res_pos = str_len;
        if (OB_NOT_NULL(ret)) {
          *ret = OB_ERROR_OUT_OF_RANGE;
        }
      } else {
        res_pos = raw_pos;
      }
    } else {
      LOG_WARN_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)", K(cs), K(cs->cset), K(lbt()));
    }
  }
  return res_pos;
}

size_t ObCharset::max_bytes_charpos(const ObCollationType collation_type,
                              const char *str,
                              const int64_t str_len,
                              const int64_t max_bytes,
                              int64_t &char_len)
{
  size_t ret = 0;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)", K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset), K(lbt()));
    } else {
      size_t char_len_tmp = 0;
      ret = cs->cset->max_bytes_charpos(cs, str, str + str_len, max_bytes, &char_len_tmp);
      char_len = char_len_tmp;
    }
  }
  return ret;
}

/**
 * @brief Match str against the wildcard pattern wildstr with the collation's wildcmp().
 *
 * @return true only when the handler returns 0; false for any other handler result
 *         and (after logging a warning) for an unusable collation.
 */
bool ObCharset::wildcmp(ObCollationType collation_type,
                       const ObString &str,
                       const ObString &wildstr,
                       int32_t escape, int32_t w_one, int32_t w_many)
{
  bool ret = false;
  const bool type_in_range = (collation_type > CS_TYPE_INVALID && collation_type < CS_TYPE_MAX);
  if (OB_UNLIKELY(!type_in_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)", K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->coll)) {
      // Handler result, as documented by the original code:
      //    0 if matched
      //   -1 if not matched with wildcard
      //    1 if matched with wildcard
      const int match_rc = cs->coll->wildcmp(cs, str.ptr(), str.ptr() + str.length(),
                                             wildstr.ptr(), wildstr.ptr() + wildstr.length(),
                                             escape, w_one, w_many);
      ret = (0 == match_rc);
    } else {
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->coll), K(lbt()));
    }
  }
  return ret;
}

/**
 * @brief Decode the first character of mb into wc using the charset's mb_wc().
 *
 * @param wc  out: the decoded value; written only on success.
 * @return OB_ERR_UNEXPECTED (collation out of range or no cset handler),
 *         OB_NOT_SUPPORTED (no charset entry), OB_ERR_INCORRECT_STRING_VALUE
 *         (handler returned <= 0), or OB_SUCCESS.
 */
int ObCharset::mb_wc(ObCollationType collation_type,
                      const ObString &mb, int32_t &wc)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      const unsigned char *first = reinterpret_cast<const unsigned char*>(mb.ptr());
      const unsigned char *last = reinterpret_cast<const unsigned char*>(mb.ptr()+mb.length());
      ob_wc_t decoded;
      const int consumed = cs->cset->mb_wc(cs, &decoded, first, last);
      if (consumed > 0) {
        wc = static_cast<int32_t>(decoded);
      } else {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
      }
    } else {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    }
  }
  return ret;
}

/**
 * @brief Decode the first character of [mb, mb + mb_size) using the charset's mb_wc().
 *
 * @param length  out: the positive value returned by the handler (used by this file as
 *                the number of bytes consumed); written only on success.
 * @param wc      out: the decoded value; written only on success.
 * @return OB_ERR_UNEXPECTED (collation out of range or no cset handler),
 *         OB_NOT_SUPPORTED (no charset entry), OB_ERR_INCORRECT_STRING_VALUE
 *         (handler returned <= 0), or OB_SUCCESS.
 */
int ObCharset::mb_wc(ObCollationType collation_type,
                     const char *mb,
                     const int64_t mb_size,
                     int32_t &length,
                     int32_t &wc)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      const unsigned char *first = reinterpret_cast<const unsigned char*>(mb);
      const unsigned char *last = reinterpret_cast<const unsigned char*>(mb + mb_size);
      ob_wc_t decoded;
      const int consumed = cs->cset->mb_wc(cs, &decoded, first, last);
      if (consumed > 0) {
        wc = static_cast<int32_t>(decoded);
        length = static_cast<int32_t>(consumed);
      } else {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
      }
    } else {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    }
  }
  return ret;
}

/**
 * @brief Sum the display width (terminal columns) of the characters in mb.
 *
 * Each character is decoded with the charset's mb_wc() and measured with
 * mk_wcwidth_cjk() for CJK charsets or mk_wcwidth() otherwise. A width <= 0 and an
 * illegal sequence (OB_CS_ILSEQ) both count as one column; an illegal sequence
 * advances by one byte. Scanning stops at the first negative decode result or when a
 * character would extend past the end of mb.
 *
 * @param width  out: reset to 0 on entry, then accumulated.
 * @return OB_ERR_UNEXPECTED (collation out of range or no cset handler),
 *         OB_NOT_SUPPORTED (no charset entry), or OB_SUCCESS.
 */
int ObCharset::display_len(ObCollationType collation_type,
                           const ObString &mb, int64_t &width)
{
  int ret = OB_SUCCESS;
  width = 0;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    } else {
      const unsigned char *base = reinterpret_cast<const unsigned char*>(mb.ptr());
      const int64_t total_bytes = mb.length();
      int64_t offset = 0;

      while (OB_SUCC(ret) && offset < total_bytes) {
        ob_wc_t wc;
        int consumed = cs->cset->mb_wc(cs, &wc, base + offset, base + total_bytes);
        if (consumed < 0) {
          // Negative decode result: stop measuring.
          break;
        }

        int columns = 0;
        if (consumed > OB_CS_ILSEQ) {
          columns = ObCharset::is_cjk_charset(collation_type) ? mk_wcwidth_cjk(wc) : mk_wcwidth(wc);
        }
        if (columns <= 0) {
          // Control / zero-width / undecodable characters still occupy one column.
          columns = 1;
        }
        if (OB_CS_ILSEQ == consumed) {
          // Illegal sequence: skip a single byte.
          consumed = 1;
        }
        if (offset + consumed > total_bytes) {
          break;
        }
        width += columns;
        offset += consumed;
      }
    }
  }
  return ret;
}

/**
 * @brief Find how many leading bytes of [mb, mb + mb_size) fit into max_width columns.
 *
 * Characters are decoded and measured exactly as in display_len(): width <= 0 and
 * illegal sequences count as one column, an illegal sequence advances one byte.
 * Scanning stops at a negative decode result, when a character would run past the
 * end of the input, or when adding it would exceed max_width.
 *
 * @param char_pos         out: byte offset just past the last accepted character
 *                         (only written once the collation has been validated).
 * @param total_width_ret  optional out: columns used by the accepted characters.
 * @return OB_ERR_UNEXPECTED (collation out of range or no cset handler),
 *         OB_NOT_SUPPORTED (no charset entry), or OB_SUCCESS.
 */
int ObCharset::max_display_width_charpos(ObCollationType collation_type, const char *mb, const int64_t mb_size,
                                         const int64_t max_width, int64_t &char_pos, int64_t *total_width_ret)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    } else {
      const unsigned char *base = reinterpret_cast<const unsigned char*>(mb);
      int64_t used_width = 0;
      char_pos = 0;

      while (OB_SUCC(ret) && char_pos < mb_size) {
        ob_wc_t wc;
        int consumed = cs->cset->mb_wc(cs, &wc, base + char_pos, base + mb_size);
        if (consumed < 0) {
          // Remaining buffer is too small for another character.
          break;
        }

        int columns = 0;
        if (consumed > OB_CS_ILSEQ) {
          columns = ObCharset::is_cjk_charset(collation_type) ? mk_wcwidth_cjk(wc) : mk_wcwidth(wc);
        }
        if (columns <= 0) {
          columns = 1;
        }
        if (OB_CS_ILSEQ == consumed) {
          consumed = 1;
        }
        if (char_pos + consumed > mb_size || used_width + columns > max_width) {
          break;
        }
        used_width += columns;
        char_pos += consumed;
      }

      if (OB_SUCC(ret) && NULL != total_width_ret) {
        *total_width_ret = used_width;
      }
    }
  }
  return ret;
}


/**
 * @brief Encode the code point wc into [buff, buff + buff_len) with the charset's wc_mb().
 *
 * @param length  out: the positive value returned by the handler; written only on success.
 * @return OB_ERR_UNEXPECTED (collation out of range or no charset/cset handler),
 *         OB_NOT_SUPPORTED (no charset entry), OB_ERR_INCORRECT_STRING_VALUE
 *         (handler returned <= 0), or OB_SUCCESS.
 */
int ObCharset::wc_mb(ObCollationType collation_type, int32_t wc, char *buff, int32_t buff_len, int32_t &length)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)", K(ret), K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs) || OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(ret));
    } else {
      unsigned char *out_begin = reinterpret_cast<unsigned char*>(buff);
      unsigned char *out_end = reinterpret_cast<unsigned char*>(buff + buff_len);
      const int written = cs->cset->wc_mb(cs, wc, out_begin, out_end);
      if (written > 0) {
        length = written;
      } else {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
      }
    }
  }
  return ret;
}

/**
 * @brief Map a charset type to its lower-case name.
 * @return the name literal, or "invalid_type" for any value not listed below.
 */
const char *ObCharset::charset_name(ObCharsetType charset_type)
{
  switch (charset_type) {
    case CHARSET_BINARY:
      return "binary";
    case CHARSET_UTF8MB4:
      return "utf8mb4";
    case CHARSET_GBK:
      return "gbk";
    case CHARSET_UTF16:
      return "utf16";
    case CHARSET_GB18030:
      return "gb18030";
    case CHARSET_LATIN1:
      return "latin1";
    case CHARSET_GB18030_2022:
      return "gb18030_2022";
    default:
      return "invalid_type";
  }
}

/**
 * @brief Name of the charset that collation_type belongs to
 *        (resolved through charset_type_by_coll()).
 */
const char *ObCharset::charset_name(ObCollationType collation_type)
{
  const ObCharsetType owning_charset = charset_type_by_coll(collation_type);
  return charset_name(owning_charset);
}

/**
 * @brief Name of a collation as stored in its charset_arr entry.
 * @return the entry's name, or "invalid_type" when collation_type is out of range or
 *         has no entry.
 */
const char *ObCharset::collation_name(ObCollationType collation_type)
{
  const char *name = "invalid_type";
  if (collation_type >= CS_TYPE_INVALID && collation_type < CS_TYPE_MAX) {
    ObCharsetInfo *info = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (NULL != info) {
      name = info->name;
    }
  }
  return name;
}

/**
 * @brief Reject implicit conversion between the GB18030 and GB18030_2022 charsets
 *        (in either direction).
 * @return OB_CANT_AGGREGATE_2COLLATIONS for such a pair, OB_SUCCESS for anything else.
 */
int ObCharset::check_valid_implicit_convert(ObCollationType src_type, ObCollationType dst_type)
{
  int ret = OB_SUCCESS;
  const ObCharsetType from_cs = ObCharset::charset_type_by_coll(src_type);
  const ObCharsetType to_cs = ObCharset::charset_type_by_coll(dst_type);
  const bool plain_to_2022 = (CHARSET_GB18030 == from_cs && CHARSET_GB18030_2022 == to_cs);
  const bool v2022_to_plain = (CHARSET_GB18030_2022 == from_cs && CHARSET_GB18030 == to_cs);
  if (plain_to_2022 || v2022_to_plain) {
    ret = OB_CANT_AGGREGATE_2COLLATIONS;
    LOG_WARN("implict cast between GB18030 and GB18030_2022 not allowed", K(ret));
  }
  return ret;
}

/**
 * @brief Look up the name of a collation and return it as an ObString.
 *
 * @param coll_name  out: wraps the name pointer of the charset_arr entry; written only
 *                   on success.
 * @return OB_INVALID_ARGUMENT when collation_type is out of range or has no entry,
 *         OB_SUCCESS otherwise.
 */
int ObCharset::collation_name(ObCollationType collation_type, ObString &coll_name)
{
  int ret = OB_SUCCESS;
  const bool type_in_range = (collation_type >= CS_TYPE_INVALID && collation_type < CS_TYPE_MAX);
  ObCharsetInfo *entry = NULL;
  if (type_in_range) {
    entry = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
  }
  if (OB_NOT_NULL(entry)) {
    coll_name = ObString(entry->name);
  } else {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid collation type", K(ret), K(collation_type));
  }
  return ret;
}

/**
 * @brief Map a collation level to its upper-case name.
 * @return the name literal, or "unknown_collation_level" for any value not listed below.
 */
const char* ObCharset::collation_level(const ObCollationLevel cs_level)
{
  switch (cs_level) {
    case CS_LEVEL_EXPLICIT:
      return "EXPLICIT";
    case CS_LEVEL_NONE:
      return "NONE";
    case CS_LEVEL_IMPLICIT:
      return "IMPLICIT";
    case CS_LEVEL_SYSCONST:
      return "SYSCONST";
    case CS_LEVEL_COERCIBLE:
      return "COERCIBLE";
    case CS_LEVEL_NUMERIC:
      return "NUMERIC";
    case CS_LEVEL_IGNORABLE:
      return "IGNORABLE";
    case CS_LEVEL_INVALID:
      return "INVALID";
    default:
      return "unknown_collation_level";
  }
}


/**
 * @brief Resolve a charset name (case-insensitive) to its ObCharsetType.
 * @return the matching type, or CHARSET_INVALID when the name is not recognised.
 */
ObCharsetType ObCharset::charset_type(const ObString &cs_name)
{
  // "utf8" is accepted as an alias of utf8mb4.
  if (0 == cs_name.case_compare("utf8")) {
    return CHARSET_UTF8MB4;
  }
  if (0 == cs_name.case_compare(ob_charset_utf8mb4_bin.csname)) {
    return CHARSET_UTF8MB4;
  }
  if (0 == cs_name.case_compare(ob_charset_bin.csname)) {
    return CHARSET_BINARY;
  }
  if (0 == cs_name.case_compare(ob_charset_gbk_bin.csname)) {
    return CHARSET_GBK;
  }
  if (0 == cs_name.case_compare(ob_charset_utf16_general_ci.csname)) {
    return CHARSET_UTF16;
  }
  if (0 == cs_name.case_compare(ob_charset_gb18030_bin.csname)) {
    return CHARSET_GB18030;
  }
  if (0 == cs_name.case_compare(ob_charset_latin1.csname)) {
    return CHARSET_LATIN1;
  }
  if (0 == cs_name.case_compare(ob_charset_gb18030_2022_bin.csname)) {
    return CHARSET_GB18030_2022;
  }
  return CHARSET_INVALID;
}

/**
 * @brief Resolve an Oracle-style charset name (case-insensitive) to its ObCharsetType.
 * @return the matching type, or CHARSET_INVALID when the name is not in the table.
 */
ObCharsetType ObCharset::charset_type_by_name_oracle(const ObString &cs_name)
{
  // Oracle name -> charset type; searched in order, first match wins.
  static const struct {
    const char *oracle_name;
    ObCharsetType type;
  } ORACLE_NAMES[] = {
    {"AL32UTF8", CHARSET_UTF8MB4},
    {"UTF8", CHARSET_UTF8MB4},
    {"AL16UTF16", CHARSET_UTF16},
    {"ZHS16GBK", CHARSET_GBK},
    {"ZHS32GB18030", CHARSET_GB18030},
    {"WE8MSWIN1252", CHARSET_LATIN1},
    {"ZHS32GB18030_2022", CHARSET_GB18030_2022},
  };
  const size_t name_count = sizeof(ORACLE_NAMES) / sizeof(ORACLE_NAMES[0]);

  ObCharsetType charset_type = CHARSET_INVALID;
  for (size_t i = 0; CHARSET_INVALID == charset_type && i < name_count; ++i) {
    if (0 == cs_name.case_compare(ORACLE_NAMES[i].oracle_name)) {
      charset_type = ORACLE_NAMES[i].type;
    }
  }
  return charset_type;
}

/**
 * @brief C-string convenience wrapper around charset_type(const ObString &).
 * @return CHARSET_INVALID (after logging a warning) when cs_name is NULL, otherwise the
 *         result of the ObString overload.
 */
ObCharsetType ObCharset::charset_type(const char *cs_name)
{
  ObCharsetType ct = CHARSET_INVALID;
  if (OB_NOT_NULL(cs_name)) {
    const ObString wrapped_name = ObString::make_string(cs_name);
    ct = charset_type(wrapped_name);
  } else {
    LOG_WARN_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)",
              K(ret), K(ct), KP(cs_name), K(lbt()));
  }
  return ct;
}

/**
 * @brief Resolve a collation name (case-insensitive) to its ObCollationType.
 *
 * Besides the names stored in the ob_charset_* descriptors, the "utf8_*" spellings are
 * accepted for the utf8mb4 collations and "any_cs" maps to CS_TYPE_ANY.
 *
 * @return the matching type, or CS_TYPE_INVALID when the name is not recognised.
 */
ObCollationType ObCharset::collation_type(const ObString &cs_name)
{
  // Name -> collation type; searched in order, first match wins.
  const struct {
    const char *coll_name;
    ObCollationType type;
  } known_collations[] = {
    {"utf8_bin", CS_TYPE_UTF8MB4_BIN},
    {"utf8_general_ci", CS_TYPE_UTF8MB4_GENERAL_CI},
    {ob_charset_utf8mb4_bin.name, CS_TYPE_UTF8MB4_BIN},
    {ob_charset_utf8mb4_general_ci.name, CS_TYPE_UTF8MB4_GENERAL_CI},
    {ob_charset_bin.name, CS_TYPE_BINARY},
    {ob_charset_gbk_chinese_ci.name, CS_TYPE_GBK_CHINESE_CI},
    {ob_charset_gbk_bin.name, CS_TYPE_GBK_BIN},
    {ob_charset_utf16_general_ci.name, CS_TYPE_UTF16_GENERAL_CI},
    {ob_charset_utf16_bin.name, CS_TYPE_UTF16_BIN},
    {"utf8_unicode_ci", CS_TYPE_UTF8MB4_UNICODE_CI},
    {ob_charset_utf16_unicode_ci.name, CS_TYPE_UTF16_UNICODE_CI},
    {ob_charset_utf8mb4_unicode_ci.name, CS_TYPE_UTF8MB4_UNICODE_CI},
    {ob_charset_gb18030_bin.name, CS_TYPE_GB18030_BIN},
    {ob_charset_gb18030_chinese_ci.name, CS_TYPE_GB18030_CHINESE_CI},
    {ob_charset_latin1_bin.name, CS_TYPE_LATIN1_BIN},
    {ob_charset_latin1.name, CS_TYPE_LATIN1_SWEDISH_CI},
    {ob_charset_gb18030_chinese_cs.name, CS_TYPE_GB18030_CHINESE_CS},
    {"any_cs", CS_TYPE_ANY},
    {ob_charset_gb18030_2022_bin.name, CS_TYPE_GB18030_2022_BIN},
    {ob_charset_gb18030_2022_pinyin_ci.name, CS_TYPE_GB18030_2022_PINYIN_CI},
    {ob_charset_gb18030_2022_pinyin_cs.name, CS_TYPE_GB18030_2022_PINYIN_CS},
    {ob_charset_gb18030_2022_radical_ci.name, CS_TYPE_GB18030_2022_RADICAL_CI},
    {ob_charset_gb18030_2022_radical_cs.name, CS_TYPE_GB18030_2022_RADICAL_CS},
    {ob_charset_gb18030_2022_stroke_ci.name, CS_TYPE_GB18030_2022_STROKE_CI},
    {ob_charset_gb18030_2022_stroke_cs.name, CS_TYPE_GB18030_2022_STROKE_CS},
  };
  const size_t known_count = sizeof(known_collations) / sizeof(known_collations[0]);

  ObCollationType collation_type = CS_TYPE_INVALID;
  bool matched = false;
  for (size_t i = 0; !matched && i < known_count; ++i) {
    if (0 == cs_name.case_compare(known_collations[i].coll_name)) {
      collation_type = known_collations[i].type;
      matched = true;
    }
  }
  return collation_type;
}

/**
 * @brief C-string convenience wrapper around collation_type(const ObString &).
 *        cs_name is wrapped with ObString::make_string() without a NULL check here.
 */
ObCollationType ObCharset::collation_type(const char* cs_name)
{
  const ObString wrapped_name = ObString::make_string(cs_name);
  return collation_type(wrapped_name);
}

// Tells whether collation_type is one of the collations accepted for
// charset_type. Any charset not listed below yields false.
bool ObCharset::is_valid_collation(ObCharsetType charset_type, ObCollationType collation_type)
{
  bool ret = false;
  switch (charset_type) {
    case CHARSET_UTF8MB4: {
      ret = (CS_TYPE_UTF8MB4_BIN == collation_type
             || CS_TYPE_UTF8MB4_GENERAL_CI == collation_type
             || CS_TYPE_UTF8MB4_UNICODE_CI == collation_type);
      break;
    }
    case CHARSET_BINARY: {
      ret = (CS_TYPE_BINARY == collation_type);
      break;
    }
    case CHARSET_GBK: {
      ret = (CS_TYPE_GBK_BIN == collation_type
             || CS_TYPE_GBK_CHINESE_CI == collation_type);
      break;
    }
    case CHARSET_UTF16: {
      ret = (CS_TYPE_UTF16_GENERAL_CI == collation_type
             || CS_TYPE_UTF16_BIN == collation_type
             || CS_TYPE_UTF16_UNICODE_CI == collation_type);
      break;
    }
    case CHARSET_GB18030: {
      ret = (CS_TYPE_GB18030_CHINESE_CI == collation_type
             || CS_TYPE_GB18030_BIN == collation_type);
      break;
    }
    case CHARSET_LATIN1: {
      ret = (CS_TYPE_LATIN1_SWEDISH_CI == collation_type
             || CS_TYPE_LATIN1_BIN == collation_type);
      break;
    }
    case CHARSET_GB18030_2022: {
      // Every gb18030_2022 collation is accepted for this charset.
      ret = is_gb18030_2022(collation_type);
      break;
    }
    default: {
      break;
    }
  }
  return ret;
}
// Maps an NLSSORT parameter name (matched case-insensitively) together with a
// charset to a collation type. CS_TYPE_INVALID is returned when the name is not
// recognised, when is_valid_nls_collation()/is_valid_charset() reject the
// inputs, or when a mark-based sort is requested for latin1.
ObCollationType ObCharset::get_coll_type_by_nlssort_param(ObCharsetType charset_type,
                                                          const ObString &nlssort_param)
{
  // Binary collation of each charset; indexed by the ObCharsetType value.
  static ObCollationType bin_coll_map[CHARSET_MAX] = {
    CS_TYPE_INVALID,
    CS_TYPE_BINARY,
    CS_TYPE_UTF8MB4_BIN,
    CS_TYPE_GBK_BIN,
    CS_TYPE_UTF16_BIN,
    CS_TYPE_GB18030_BIN,
    CS_TYPE_LATIN1_BIN,
    CS_TYPE_GB18030_2022_BIN,
  };
  // Base marks for the mark-based sorts; indexed by the ObNLSCollation value.
  // Entries without an initializer are zero-initialized.
  // NOTE(review): confirm the order here matches the ObNLSCollation enum.
  static ObCollationType non_bin_coll_marks[NLS_COLLATION_MAX] = {
    CS_TYPE_INVALID,
    CS_TYPE_PINYIN_BEGIN_MARK,
    CS_TYPE_RADICAL_BEGIN_MARK,
    CS_TYPE_STROKE_BEGIN_MARK,
  };
  // Recognised NLSSORT names and the NLS collation each one selects.
  struct NlsSortName
  {
    const char *name_;
    ObNLSCollation nls_coll_;
  };
  static const NlsSortName nls_sort_names[] = {
    {"SCHINESE_PINYIN_M", NLS_COLLATION_SCHINESE_PINYIN_M},
    {"SCHINESE_PINYIN2_M", NLS_COLLATION_SCHINESE_PINYIN2_M},
    {"SCHINESE_RADICAL2_M", NLS_COLLATION_SCHINESE_RADICAL2_M},
    {"SCHINESE_STROKE2_M", NLS_COLLATION_SCHINESE_STROKE2_M},
    {"UCA0900_SCHINESE_PINYIN", NLS_COLLATION_SCHINESE_PINYIN_900},
    {"UCA0900_SCHINESE_RADICAL", NLS_COLLATION_SCHINESE_RADICAL_900},
    {"UCA0900_SCHINESE_STROKE", NLS_COLLATION_SCHINESE_STROKE_900},
    {"BINARY", NLS_COLLATION_BINARY},
  };
  const int64_t name_cnt =
      static_cast<int64_t>(sizeof(nls_sort_names) / sizeof(nls_sort_names[0]));

  ObNLSCollation nls_coll_type = NLS_COLLATION_INVALID;
  bool matched = false;
  for (int64_t i = 0; !matched && i < name_cnt; ++i) {
    if (0 == nlssort_param.case_compare(nls_sort_names[i].name_)) {
      nls_coll_type = nls_sort_names[i].nls_coll_;
      matched = true;
    }
  }

  ObCollationType coll_type = CS_TYPE_INVALID;
  if (is_valid_nls_collation(nls_coll_type) && is_valid_charset(charset_type)) {
    if (NLS_COLLATION_BINARY == nls_coll_type) {
      coll_type = bin_coll_map[charset_type];
    } else if (NLS_COLLATION_SCHINESE_PINYIN_M == nls_coll_type) {
      coll_type = CS_TYPE_GB18030_CHINESE_CS;
    } else if (NLS_COLLATION_SCHINESE_PINYIN2_M == nls_coll_type) {
      coll_type = CS_TYPE_GB18030_2022_PINYIN_CS;
    } else if (NLS_COLLATION_SCHINESE_RADICAL2_M == nls_coll_type) {
      coll_type = CS_TYPE_GB18030_2022_RADICAL_CS;
    } else if (NLS_COLLATION_SCHINESE_STROKE2_M == nls_coll_type) {
      coll_type = CS_TYPE_GB18030_2022_STROKE_CS;
    } else if (CHARSET_LATIN1 != charset_type) {
      // Mark-based sorts: base mark of the sort plus the charset's offset from
      // CHARSET_BINARY. latin1 keeps CS_TYPE_INVALID.
      coll_type = static_cast<ObCollationType>(
            non_bin_coll_marks[nls_coll_type] + (charset_type - CHARSET_BINARY));
    }
  }
  return coll_type;
}

// Tells whether the integer value names a collation this module accepts: one
// of the explicitly listed collations, any gb18030_2022 collation, or any value
// strictly between CS_TYPE_EXTENDED_MARK and CS_TYPE_MAX.
bool ObCharset::is_valid_collation(int64_t collation_type_int)
{
  const ObCollationType collation_type = static_cast<ObCollationType>(collation_type_int);
  bool is_valid = false;
  switch (collation_type) {
    case CS_TYPE_UTF8MB4_GENERAL_CI:
    case CS_TYPE_UTF8MB4_BIN:
    case CS_TYPE_UTF8MB4_UNICODE_CI:
    case CS_TYPE_BINARY:
    case CS_TYPE_GBK_BIN:
    case CS_TYPE_GBK_CHINESE_CI:
    case CS_TYPE_UTF16_BIN:
    case CS_TYPE_UTF16_GENERAL_CI:
    case CS_TYPE_UTF16_UNICODE_CI:
    case CS_TYPE_GB18030_BIN:
    case CS_TYPE_GB18030_CHINESE_CI:
    case CS_TYPE_GB18030_CHINESE_CS:
    case CS_TYPE_LATIN1_SWEDISH_CI:
    case CS_TYPE_LATIN1_BIN: {
      is_valid = true;
      break;
    }
    default: {
      is_valid = is_gb18030_2022(collation_type)
          || (CS_TYPE_EXTENDED_MARK < collation_type && collation_type < CS_TYPE_MAX);
      break;
    }
  }
  return is_valid;
}

// Returns the charset that owns the given collation, or CHARSET_INVALID when
// the collation is not listed below.
ObCharsetType ObCharset::charset_type_by_coll(ObCollationType collation_type)
{
  switch (collation_type) {
    // utf8mb4 collations
    case CS_TYPE_UTF8MB4_GENERAL_CI:
    case CS_TYPE_UTF8MB4_BIN:
    case CS_TYPE_UTF8MB4_UNICODE_CI:
    case CS_TYPE_UTF8MB4_ZH_0900_AS_CS:
    case CS_TYPE_UTF8MB4_ZH2_0900_AS_CS:
    case CS_TYPE_UTF8MB4_ZH3_0900_AS_CS:
      return CHARSET_UTF8MB4;

    // binary
    case CS_TYPE_BINARY:
      return CHARSET_BINARY;

    // gbk collations
    case CS_TYPE_GBK_BIN:
    case CS_TYPE_GBK_CHINESE_CI:
    case CS_TYPE_GBK_ZH_0900_AS_CS:
    case CS_TYPE_GBK_ZH2_0900_AS_CS:
    case CS_TYPE_GBK_ZH3_0900_AS_CS:
      return CHARSET_GBK;

    // utf16 collations
    case CS_TYPE_UTF16_BIN:
    case CS_TYPE_UTF16_GENERAL_CI:
    case CS_TYPE_UTF16_UNICODE_CI:
    case CS_TYPE_UTF16_ZH_0900_AS_CS:
    case CS_TYPE_UTF16_ZH2_0900_AS_CS:
    case CS_TYPE_UTF16_ZH3_0900_AS_CS:
      return CHARSET_UTF16;

    // gb18030 collations
    case CS_TYPE_GB18030_BIN:
    case CS_TYPE_GB18030_CHINESE_CI:
    case CS_TYPE_GB18030_CHINESE_CS:
    case CS_TYPE_GB18030_ZH_0900_AS_CS:
    case CS_TYPE_GB18030_ZH2_0900_AS_CS:
    case CS_TYPE_GB18030_ZH3_0900_AS_CS:
      return CHARSET_GB18030;

    // latin1 collations
    case CS_TYPE_LATIN1_SWEDISH_CI:
    case CS_TYPE_LATIN1_BIN:
      return CHARSET_LATIN1;

    // gb18030_2022 collations
    case CS_TYPE_GB18030_2022_BIN:
    case CS_TYPE_GB18030_2022_PINYIN_CI:
    case CS_TYPE_GB18030_2022_PINYIN_CS:
    case CS_TYPE_GB18030_2022_RADICAL_CI:
    case CS_TYPE_GB18030_2022_RADICAL_CS:
    case CS_TYPE_GB18030_2022_STROKE_CI:
    case CS_TYPE_GB18030_2022_STROKE_CS:
    case CS_TYPE_GB18030_2022_ZH_0900_AS_CS:
    case CS_TYPE_GB18030_2022_ZH2_0900_AS_CS:
    case CS_TYPE_GB18030_2022_ZH3_0900_AS_CS:
      return CHARSET_GB18030_2022;

    default:
      return CHARSET_INVALID;
  }
}

// Translates an internal charset type into the corresponding ObNlsCharsetId.
// Charsets without a mapping below yield CHARSET_INVALID_ID.
ObNlsCharsetId ObCharset::charset_type_to_ora_charset_id(ObCharsetType cs_type)
{
  switch (cs_type) {
    case CHARSET_UTF8MB4:
      return CHARSET_AL32UTF8_ID;
    case CHARSET_GBK:
      return CHARSET_ZHS16GBK_ID;
    case CHARSET_GB18030:
      return CHARSET_ZHS32GB18030_ID;
    case CHARSET_UTF16:
      return CHARSET_AL16UTF16_ID;
    case CHARSET_LATIN1:
      return CHARSET_WE8MSWIN1252_ID;
    case CHARSET_GB18030_2022:
      return CHARSET_ZHS32GB18030_2022_ID;
    default:
      return CHARSET_INVALID_ID;
  }
}

// Translates an ObNlsCharsetId into the internal charset type; this is the
// inverse of charset_type_to_ora_charset_id(). Ids without a mapping below
// yield CHARSET_INVALID.
ObCharsetType ObCharset::ora_charset_type_to_charset_type(ObNlsCharsetId charset_id)
{
  ObCharsetType cs_type = CHARSET_INVALID;
  switch (charset_id)
  {
    case CHARSET_AL32UTF8_ID:
      cs_type = CHARSET_UTF8MB4;
      break;
    case CHARSET_ZHS16GBK_ID:
      cs_type = CHARSET_GBK;
      break;
    case CHARSET_ZHS32GB18030_ID:
      cs_type = CHARSET_GB18030;
      break;
    case CHARSET_AL16UTF16_ID:
      cs_type = CHARSET_UTF16;
      break;
    case CHARSET_WE8MSWIN1252_ID:
      cs_type = CHARSET_LATIN1;
      // This break was missing: the case fell through and WE8MSWIN1252 was
      // reported as CHARSET_GB18030_2022.
      break;
    case CHARSET_ZHS32GB18030_2022_ID:
      cs_type = CHARSET_GB18030_2022;
      break;
    default:
      break;
  }
  return cs_type;
}

// True when nls_collation lies strictly between NLS_COLLATION_INVALID and
// NLS_COLLATION_MAX.
bool ObCharset::is_valid_nls_collation(ObNLSCollation nls_collation)
{
  const bool above_invalid = (NLS_COLLATION_INVALID < nls_collation);
  const bool below_max = (NLS_COLLATION_MAX > nls_collation);
  return above_invalid && below_max;
}

int ObCharset::charset_name_by_coll(const ObString &coll_name, ObString &cs_name)
{
  int ret = OB_SUCCESS;
  ObCollationType coll_type = collation_type(coll_name);
  if (OB_UNLIKELY(CS_TYPE_INVALID == coll_type)) {
    ret = OB_ERR_UNKNOWN_COLLATION;
    LOG_WARN("invalid collation type", K(ret), K(coll_name));
  } else if (OB_FAIL(charset_name_by_coll(coll_type, cs_name))) {
    LOG_WARN("fail to get charset type by collation type", K(ret), K(coll_type), K(coll_name));
  }
  return ret;
}

// Writes into cs_name the name of the charset that owns collation_type.
// Errors:
//   OB_ERR_UNKNOWN_COLLATION - collation_type is CS_TYPE_INVALID
//   OB_ERR_UNKNOWN_CHARSET   - no charset is known for the collation
//   OB_ERR_UNEXPECTED        - charset_name() returned "invalid_type"
// cs_name is only written on success.
int ObCharset::charset_name_by_coll(ObCollationType collation_type, ObString &cs_name)
{
  int ret = OB_SUCCESS;
  ObCharsetType charset_type = CHARSET_INVALID;
  if (OB_UNLIKELY(CS_TYPE_INVALID == collation_type)) {
    ret = OB_ERR_UNKNOWN_COLLATION;
    LOG_WARN("invalid collation type", K(ret), K(collation_type));
  } else if (OB_UNLIKELY(CHARSET_INVALID == (charset_type = charset_type_by_coll(collation_type)))) {
    ret = OB_ERR_UNKNOWN_CHARSET;
    LOG_WARN("has no charset type of this collation type", K(ret), K(collation_type));
  } else {
    ObString tmp_cs_name(charset_name(charset_type));
    if (OB_UNLIKELY(tmp_cs_name == "invalid_type")) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("charset str is invalid_type", K(ret), K(charset_type), K(collation_type));
    } else {
      cs_name = tmp_cs_name;
    }
  }
  return ret;
}

// Thin wrapper: forwards all arguments unchanged to result_collation() and
// returns its result.
int ObCharset::calc_collation(
    const ObCollationLevel collation_level1,
    const ObCollationType collation_type1,
    const ObCollationLevel collation_level2,
    const ObCollationType collation_type2,
    ObCollationLevel &res_level,
    ObCollationType &res_type)
{
  const int ret = ObCharset::result_collation(collation_level1,
                                              collation_type1,
                                              collation_level2,
                                              collation_type2,
                                              res_level,
                                              res_type);
  return ret;
}

// Picks the resulting collation level/type for two operands.
//  - Any invalid level or type: OB_ERR_UNEXPECTED.
//  - Different levels: the operand with the smaller level wins.
//  - Same level, same type: that type.
//  - Same level, different types: OB_CANT_AGGREGATE_2COLLATIONS when the level
//    is CS_LEVEL_EXPLICIT, otherwise CS_TYPE_UTF8MB4_BIN.
// res_level/res_type are only written on success.
int ObCharset::result_collation(
    const ObCollationLevel collation_level1,
    const ObCollationType collation_type1,
    const ObCollationLevel collation_level2,
    const ObCollationType collation_type2,
    ObCollationLevel &res_level,
    ObCollationType &res_type)
{
  int ret = OB_SUCCESS;
  const bool has_invalid_input = (CS_LEVEL_INVALID == collation_level1
                                  || CS_LEVEL_INVALID == collation_level2
                                  || CS_TYPE_INVALID == collation_type1
                                  || CS_TYPE_INVALID == collation_type2);
  if (OB_UNLIKELY(has_invalid_input)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("invalid collation level or type", K(collation_level1), K(collation_type1), K(collation_level2), K(collation_type2));
  } else if (collation_level1 != collation_level2) {
    const bool first_wins = (collation_level1 < collation_level2);
    res_level = first_wins ? collation_level1 : collation_level2;
    res_type = first_wins ? collation_type1 : collation_type2;
  } else if (collation_type1 == collation_type2) {
    res_level = collation_level1;
    res_type = collation_type1;
  } else if (CS_LEVEL_EXPLICIT == collation_level1) {
    // ERROR 1267 (HY000): Illegal mix of collations (utf8_general_ci,EXPLICIT) and (utf8_bin,EXPLICIT) for operation '='
    ret = OB_CANT_AGGREGATE_2COLLATIONS;
  } else {
    // Only bin & general_ci are considered here; this fallback has to be
    // revisited if more collations need to be supported.
    res_level = collation_level1;
    res_type = CS_TYPE_UTF8MB4_BIN;
  }
  return ret;
}

// Aggregates the collations of two operands into a single result collation.
//
// Returns OB_ERR_UNEXPECTED when a level is CS_LEVEL_INVALID or a type fails
// is_valid_collation(), and OB_CANT_AGGREGATE_2COLLATIONS when the two
// collations cannot be mixed. Any failure past the argument check is also
// logged as "Illegal mix of collations".
// NOTE: res_level/res_type may already have been assigned when the final
// gb18030 / gb18030_2022 check turns the result into an error.
int ObCharset::aggregate_collation(
    const ObCollationLevel collation_level1,
    const ObCollationType collation_type1,
    const ObCollationLevel collation_level2,
    const ObCollationType collation_type2,
    ObCollationLevel &res_level,
    ObCollationType &res_type)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(
      CS_LEVEL_INVALID == collation_level1
      || CS_LEVEL_INVALID == collation_level2
      || !is_valid_collation(collation_type1)
      || !is_valid_collation(collation_type2))) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN ("invalid collation level or type",
              K(ret), K(collation_level1), K(collation_type1), K(collation_level2), K(collation_type2));
  } else {
    /** Compare the levels first: the smaller level has the higher priority and
      * its collation is used as the result.
      * If the levels are equal and one side is binary, binary is used.
      * If both sides are strings, the rules below are applied.
      */
    ObCharsetType cs1 = charset_type_by_coll(collation_type1);
    ObCharsetType cs2 = charset_type_by_coll(collation_type2);
    if (collation_level1 < collation_level2) {
      res_type = collation_type1;
      res_level = collation_level1;
    } else if (collation_level2 < collation_level1) {
      res_type = collation_type2;
      res_level = collation_level2;
    } else if (CS_TYPE_BINARY == collation_type1) {
      res_level = collation_level1;
      res_type = collation_type1;
    } else if (CS_TYPE_BINARY == collation_type2) {
      res_level = collation_level2;
      res_type = collation_type2;
    } else if (cs1 != cs2) {
        /**
        * The two sides use different charsets.
        * The main cases are:
        * utf8mb4 and utf16: use utf16
        * utf8mb4 and gbk: use utf8mb4
        * utf16 and gbk: use utf16
        * utf8mb4 and gb18030: use utf8mb4
        * utf16 and gb18030: use utf16
        * gbk and gb18030: use gb18030
        * aggregating gb18030_2022 with gb18030 is forbidden for now
        * any of the charsets X above combined with latin1 yields X; latin1
        * currently has the lowest rank
        */

          // Table lookup: 1 selects the first operand, 2 selects the second.
          int res = AGGREGATE_2CHARSET[cs1][cs2];
          if (res == 1) {
            res_type = collation_type1;
            res_level = collation_level1;
          } else if (res == 2) {
            res_type = collation_type2;
            res_level = collation_level2;
          } else {
            // every combination that cannot be converted ends up here
            ret = OB_CANT_AGGREGATE_2COLLATIONS;
          }
    } else {
      // same charset on both sides; each charset is handled separately
      if (collation_type1 == collation_type2) {
        res_type = collation_type1;
        res_level = collation_level1;
      } else if (CS_LEVEL_EXPLICIT == collation_level1) {
        ret = OB_CANT_AGGREGATE_2COLLATIONS;
      // ERROR 1267 (HY000): Illegal mix of collations (utf8_general_ci,EXPLICIT) and (utf8_bin,EXPLICIT) for operation '='
      // LOG_USER_ERROR(ret);
      } else if (charset_type_by_coll(collation_type1) == CHARSET_UTF8MB4) {
        if (collation_type1 == CS_TYPE_UTF8MB4_BIN || collation_type2 == CS_TYPE_UTF8MB4_BIN) {
          res_type = CS_TYPE_UTF8MB4_BIN;
          res_level = (CS_TYPE_UTF8MB4_BIN == collation_type1) ? collation_level1 : collation_level2;
        } else {
          // utf8mb4_unicode_ci mixed with utf8mb4_general_ci is an error, compatible with mysql
          ret = OB_CANT_AGGREGATE_2COLLATIONS;
        }
      } else if (charset_type_by_coll(collation_type1) == CHARSET_GBK) {
          res_type = CS_TYPE_GBK_BIN;
          res_level = (CS_TYPE_GBK_BIN == collation_type1) ? collation_level1 : collation_level2;
      } else if (charset_type_by_coll(collation_type1) == CHARSET_UTF16) {
        if (collation_type1 == CS_TYPE_UTF16_BIN || collation_type2 == CS_TYPE_UTF16_BIN) {
          res_type = CS_TYPE_UTF16_BIN;
          res_level = (CS_TYPE_UTF16_BIN == collation_type1) ? collation_level1 : collation_level2;
        } else {
          // utf16_unicode_ci mixed with utf16_general_ci is reported as an error; this should not happen
          ret = OB_CANT_AGGREGATE_2COLLATIONS;
        }
      } else if (charset_type_by_coll(collation_type1) == CHARSET_GB18030) {
        res_type = CS_TYPE_GB18030_BIN;
        res_level = (CS_TYPE_GB18030_BIN == collation_type1) ? collation_level1 : collation_level2;
      } else if (charset_type_by_coll(collation_type1) == CHARSET_LATIN1) {
        if (collation_type1 == CS_TYPE_LATIN1_BIN || collation_type2 == CS_TYPE_LATIN1_BIN) {
          res_type = CS_TYPE_LATIN1_BIN;
          res_level = (CS_TYPE_LATIN1_BIN == collation_type1) ? collation_level1 : collation_level2;
        } else {
          // latin1_german may be supported in the future; it is not compatible with latin1_swedish
          ret = OB_CANT_AGGREGATE_2COLLATIONS;
        }
      } else if (charset_type_by_coll(collation_type1) == CHARSET_GB18030_2022) {
        res_type = CS_TYPE_GB18030_2022_BIN;
        res_level = (CS_TYPE_GB18030_2022_BIN == collation_type1) ? collation_level1 : collation_level2;
      } else {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("Unexpected charset", K(ret), K(collation_type1), K(collation_type2), KCSTRING(lbt()));
      }
    }

    // Final guard: a gb18030 result must not involve a gb18030_2022 operand,
    // and vice versa, regardless of which branch above chose the result.
    if (OB_SUCC(ret)) {
      ObCharsetType res_cs = charset_type_by_coll(res_type);
      if (CHARSET_GB18030 == res_cs) {
        if (CHARSET_GB18030_2022 == cs1 || CHARSET_GB18030_2022 == cs2) {
          ret = OB_CANT_AGGREGATE_2COLLATIONS;
        }
      } else if (CHARSET_GB18030_2022 == res_cs) {
        if (CHARSET_GB18030 == cs1 || CHARSET_GB18030 == cs2) {
          ret = OB_CANT_AGGREGATE_2COLLATIONS;
        }
      }
    }

    if (OB_FAIL(ret)) {
      LOG_WARN("Illegal mix of collations", K(ret),
              "type1", ObCharset::collation_name(collation_type1),
              "level1", ObCharset::collation_level(collation_level1),
              "type2", ObCharset::collation_name(collation_type2),
              "level2", ObCharset::collation_level(collation_level2));
    }
  }
  return ret;
}

// True when the charset info registered for collation_type has the
// OB_CS_BINSORT state bit set. An out-of-range collation, or one without a
// registered charset info, is logged and reported as false.
bool ObCharset::is_bin_sort(ObCollationType collation_type)
{
  bool ret = false;
  const bool type_in_range =
      (CS_TYPE_INVALID < collation_type && collation_type < CS_TYPE_MAX);
  if (OB_UNLIKELY(!type_in_range) ||
      OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    const bool binsort_flag_set = (0 != (cs->state & OB_CS_BINSORT));
    ret = binsort_flag_set;
  }
  return ret;
}

// Definitions of ObCharset's static default charset / collation members.
// get_default_charset() returns default_charset_type_.
ObCharsetType ObCharset::default_charset_type_ = CHARSET_UTF8MB4;
ObCollationType ObCharset::default_collation_type_ = CS_TYPE_UTF8MB4_GENERAL_CI;

// Returns the value of the static member default_charset_type_.
ObCharsetType ObCharset::get_default_charset()
{
  return default_charset_type_;
}

// Returns the default collation of charset_type as listed below, or
// CS_TYPE_INVALID for a charset that is not listed.
ObCollationType ObCharset::get_default_collation(ObCharsetType charset_type)
{
  switch (charset_type) {
    case CHARSET_UTF8MB4:
      return CS_TYPE_UTF8MB4_GENERAL_CI;
    case CHARSET_BINARY:
      return CS_TYPE_BINARY;
    case CHARSET_GBK:
      return CS_TYPE_GBK_CHINESE_CI;
    case CHARSET_UTF16:
      return CS_TYPE_UTF16_GENERAL_CI;
    case CHARSET_GB18030:
      return CS_TYPE_GB18030_CHINESE_CI;
    case CHARSET_LATIN1:
      return CS_TYPE_LATIN1_SWEDISH_CI;
    case CHARSET_GB18030_2022:
      return CS_TYPE_GB18030_2022_PINYIN_CI;
    default:
      return CS_TYPE_INVALID;
  }
}

// Returns the default collation of charset_type for the requested mode:
// get_default_collation_oracle() when is_oracle_mode is true, otherwise
// get_default_collation().
ObCollationType ObCharset::get_default_collation_by_mode(ObCharsetType charset_type,
                                                         bool is_oracle_mode)
{
  ObCollationType collation_type = CS_TYPE_INVALID;
  if (is_oracle_mode) {
    collation_type = get_default_collation_oracle(charset_type);
  } else {
    collation_type = get_default_collation(charset_type);
  }
  return collation_type;
}

// Returns the default collation used for charset_type in oracle mode: the
// binary collation of each listed charset, CS_TYPE_INVALID otherwise.
ObCollationType ObCharset::get_default_collation_oracle(ObCharsetType charset_type)
{
  switch (charset_type) {
    case CHARSET_UTF8MB4:
      return CS_TYPE_UTF8MB4_BIN;
    case CHARSET_BINARY:
      return CS_TYPE_BINARY;
    case CHARSET_GBK:
      return CS_TYPE_GBK_BIN;
    case CHARSET_UTF16:
      return CS_TYPE_UTF16_BIN;
    case CHARSET_GB18030:
      return CS_TYPE_GB18030_BIN;
    case CHARSET_LATIN1:
      return CS_TYPE_LATIN1_BIN;
    case CHARSET_GB18030_2022:
      return CS_TYPE_GB18030_2022_BIN;
    default:
      return CS_TYPE_INVALID;
  }
}

// Error-reporting variant of get_default_collation(ObCharsetType): stores the
// default collation of charset_type into collation_type. For a charset that
// has no default, OB_INVALID_ARGUMENT is returned and collation_type is left
// untouched.
int ObCharset::get_default_collation(ObCharsetType charset_type, ObCollationType &collation_type)
{
  int ret = OB_SUCCESS;
  // The one-argument overload carries the same charset -> collation mapping
  // and returns CS_TYPE_INVALID exactly for the charsets without a default.
  const ObCollationType default_coll = get_default_collation(charset_type);
  if (CS_TYPE_INVALID != default_coll) {
    collation_type = default_coll;
  } else {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid charset type", K(ret), K(charset_type));
  }
  return ret;
}

// Returns the binary collation of charset_type, or CS_TYPE_INVALID for a
// charset that is not listed.
ObCollationType ObCharset::get_bin_collation(ObCharsetType charset_type)
{
  switch (charset_type) {
    case CHARSET_UTF8MB4:
      return CS_TYPE_UTF8MB4_BIN;
    case CHARSET_BINARY:
      return CS_TYPE_BINARY;
    case CHARSET_GBK:
      return CS_TYPE_GBK_BIN;
    case CHARSET_UTF16:
      return CS_TYPE_UTF16_BIN;
    case CHARSET_GB18030:
      return CS_TYPE_GB18030_BIN;
    case CHARSET_LATIN1:
      return CS_TYPE_LATIN1_BIN;
    case CHARSET_GB18030_2022:
      return CS_TYPE_GB18030_2022_BIN;
    default:
      return CS_TYPE_INVALID;
  }
}

int ObCharset::get_default_collation(const ObCollationType &in, ObCollationType &out)
{
  int ret = OB_SUCCESS;
  ObCharsetType charset_type = CHARSET_INVALID;
  if (OB_UNLIKELY(in == CS_TYPE_INVALID)) {
    ret = OB_ERR_UNEXPECTED;
  } else if (OB_UNLIKELY(CHARSET_INVALID == (charset_type = ObCharset::charset_type_by_coll(in)))) {
    ret = OB_ERR_UNEXPECTED;
  } else if (OB_UNLIKELY(CS_TYPE_INVALID == (out = (lib::is_mysql_mode() ?
              ObCharset::get_default_collation(charset_type)
            : ObCharset::get_default_collation_oracle(charset_type))))) {
    ret = OB_ERR_UNEXPECTED;
  }
  return ret;
}

// Returns the fixed system collation, CS_TYPE_UTF8MB4_GENERAL_CI.
ObCollationType ObCharset::get_system_collation()
{
  const ObCollationType system_collation = CS_TYPE_UTF8MB4_GENERAL_CI;
  return system_collation;
}

// Computes the byte length of the first character in [buf, buf + buf_size)
// via the charset's well_formed_len() with a limit of one character.
// Results:
//   OB_ERR_UNEXPECTED   - collation_type out of range, or the charset info has
//                         no cset handler
//   OB_NOT_SUPPORTED    - no charset info registered for the collation
//   OB_NOT_INIT         - buf is NULL
//   OB_INVALID_ARGUMENT - well_formed_len() reported an encoding error
// A non-positive buf_size succeeds with char_len = 0.
int ObCharset::first_valid_char(
    const ObCollationType collation_type,
    const char *buf,
    const int64_t buf_size,
    int64_t &char_len)
{
  int ret = OB_SUCCESS;
  const bool type_in_range =
      (CS_TYPE_INVALID < collation_type && collation_type < CS_TYPE_MAX);
  if (OB_UNLIKELY(!type_in_range)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else if (OB_UNLIKELY(NULL == buf)) {
    ret = OB_NOT_INIT;
    LOG_WARN("Null buffer passed in", K(ret), KP(buf));
  } else if (buf_size <= 0) {
    char_len = 0;
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    } else {
      int error = 0;
      const int64_t len = static_cast<int64_t>(
          cs->cset->well_formed_len(cs, buf, buf + buf_size, 1, &error));
      if (OB_UNLIKELY(0 != error)) {
        ret = OB_INVALID_ARGUMENT;
        LOG_WARN("invalid encoding found");
      } else {
        char_len = len;
      }
    }
  }
  return ret;
}

int ObCharset::last_valid_char(
    const ObCollationType collation_type,
    const char *buf,
    const int64_t buf_size,
    int64_t &char_len)
{
  int ret = OB_SUCCESS;
  ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);

  if (OB_ISNULL(cs)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("collation type is invalid", K(collation_type), K(ret));
  } else {
    if (buf_size <= 0 || OB_ISNULL(buf)) {
      char_len = 0;
    } else {
      int64_t len = 0;
      for (len = cs->mbminlen; len <= cs->mbmaxlen; ++len) {
        int error = 0;
        int real_len =
            cs->cset->well_formed_len(cs, buf + buf_size - len, buf + buf_size, len, &error);
        if (0 == error && real_len == len) {
          char_len = len;
          break;
        }
      }
      if (len > cs->mbmaxlen) {
        ret = OB_INVALID_ARGUMENT;
        LOG_WARN("invalid encoding found", K(ret), "str", ObString(buf_size, buf));
      }
    }
  }

  return ret;
}

// Completes or validates a (charset, collation) pair in place.
//  - both invalid: OB_ERR_UNEXPECTED
//  - only the charset invalid: it is derived from the collation
//  - only the collation invalid: it becomes the charset's default collation
//  - both given: OB_ERR_COLLATION_MISMATCH unless is_valid_collation() accepts
//    the pair
// The filled-in value is not re-validated here.
int ObCharset::check_and_fill_info(ObCharsetType &charset_type, ObCollationType &collation_type)
{
  int ret = OB_SUCCESS;
  const bool charset_missing = (CHARSET_INVALID == charset_type);
  const bool collation_missing = (CS_TYPE_INVALID == collation_type);
  if (charset_missing && collation_missing) {
    ret = OB_ERR_UNEXPECTED;
  } else if (charset_missing) {
    charset_type = ObCharset::charset_type_by_coll(collation_type);
  } else if (collation_missing) {
    collation_type = ObCharset::get_default_collation(charset_type);
  } else if (!ObCharset::is_valid_collation(charset_type, collation_type)) {
    ret = OB_ERR_COLLATION_MISMATCH;
    LOG_WARN("invalid collation info", K(charset_type), K(collation_type));
  }
  return ret;
}

// True when collation_type is one of the collations listed below, i.e. the
// values get_default_collation(ObCharsetType) returns for the known charsets.
bool ObCharset::is_default_collation(ObCollationType collation_type)
{
  switch (collation_type) {
    case CS_TYPE_UTF8MB4_GENERAL_CI:
    case CS_TYPE_GBK_CHINESE_CI:
    case CS_TYPE_UTF16_GENERAL_CI:
    case CS_TYPE_GB18030_CHINESE_CI:
    case CS_TYPE_LATIN1_SWEDISH_CI:
    case CS_TYPE_GB18030_2022_PINYIN_CI:
    case CS_TYPE_BINARY:
      return true;
    default:
      return false;
  }
}


// True when collation_type equals the default collation of charset_type.
// A charset without a default collation never matches.
bool ObCharset::is_default_collation(ObCharsetType charset_type, ObCollationType collation_type)
{
  const ObCollationType default_collation_type = get_default_collation(charset_type);
  return (CS_TYPE_INVALID != default_collation_type)
      && (default_collation_type == collation_type);
}

// Compares two ObStrings under collation_type. An empty string sorts before
// any non-empty one and two empty strings compare equal; otherwise the
// pointer/length overload of strcmp() decides.
int ObCharset::strcmp(const ObCollationType collation_type, const ObString &l_str,
                      const ObString &r_str)
{
  int32_t ret = 0;
  const bool l_empty = l_str.empty();
  const bool r_empty = r_str.empty();
  if (!l_empty && !r_empty) {
    ret = ObCharset::strcmp(collation_type, l_str.ptr(), l_str.length(), r_str.ptr(), r_str.length());
  } else if (l_empty != r_empty) {
    // Exactly one side is empty.
    ret = l_empty ? -1 : 1;
  }
  return ret;
}

// Lower-cases src in place through the buffer-based casedn() overload and
// updates src's length to the returned size. An empty src is left untouched
// and 0 is returned.
size_t ObCharset::casedn(const ObCollationType collation_type, ObString &src)
{
  size_t size = 0;
  if (src.empty()) {
    // nothing to convert
  } else {
    size = casedn(collation_type, src.ptr(), src.length(), src.ptr(), src.length());
    src.set_length(static_cast<int32_t>(size));
  }
  return size;
}

// Upper-cases src in place through the buffer-based caseup() overload and
// updates src's length to the returned size. An empty src is left untouched
// and 0 is returned.
size_t ObCharset::caseup(const ObCollationType collation_type, ObString &src)
{
  size_t size = 0;
  if (src.empty()) {
    // nothing to convert
  } else {
    size = caseup(collation_type, src.ptr(), src.length(), src.ptr(), src.length());
    src.set_length(static_cast<int32_t>(size));
  }
  return size;
}

int ObCharset::toupper(const ObCollationType collation_type,
                       const ObString &src, ObString &dst,
                       ObIAllocator &allocator)
{
  int ret = OB_SUCCESS;
  const ObCharsetInfo *cs_info = NULL;
  if (OB_ISNULL(cs_info = get_charset(collation_type))) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid collation type", K(ret), K(collation_type));
  } else {
    int casemulti = cs_info->caseup_multiply;
    if (1 == casemulti) {
      if (OB_FAIL(ob_write_string(allocator, src, dst))) {
        LOG_WARN("fail to copy string", K(ret), K(src));
      } else {
        size_t size = cs_info->cset->caseup(cs_info, dst.ptr(), dst.length(), dst.ptr(), dst.length());
        dst.assign_ptr(dst.ptr(), static_cast<ObString::obstr_size_t>(size));
      }
    } else {
      char *buf = NULL;
      int64_t buf_len = src.length() * casemulti;
      if (OB_ISNULL(buf = static_cast<char*>(allocator.alloc(buf_len)))) {
        ret = OB_ALLOCATE_MEMORY_FAILED;
        LOG_WARN("fail to alloc memory", K(ret));
      } else {
        size_t size = cs_info->cset->caseup(cs_info, const_cast<char*>(src.ptr()), src.length(), buf, buf_len);
        dst.assign_ptr(buf, static_cast<ObString::obstr_size_t>(size));
      }
    }
  }
  return ret;
}


int ObCharset::tolower(const ObCollationType collation_type,
                       const ObString &src, ObString &dst,
                       ObIAllocator &allocator)
{
  int ret = OB_SUCCESS;
  const ObCharsetInfo *cs_info = NULL;
  if (OB_ISNULL(cs_info = get_charset(collation_type))) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid collation type", K(ret), K(collation_type));
  } else {
    int casemulti = cs_info->casedn_multiply;
    if (1 == casemulti) {
      if (OB_FAIL(ob_write_string(allocator, src, dst))) {
        LOG_WARN("fail to copy string", K(ret), K(src));
      } else {
        size_t size = cs_info->cset->casedn(cs_info, dst.ptr(), dst.length(), dst.ptr(), dst.length());
        dst.assign_ptr(dst.ptr(), static_cast<ObString::obstr_size_t>(size));
      }
    } else {
      char *buf = NULL;
      int64_t buf_len = src.length() * casemulti;
      if (OB_ISNULL(buf = static_cast<char*>(allocator.alloc(buf_len)))) {
        ret = OB_ALLOCATE_MEMORY_FAILED;
        LOG_WARN("fail to alloc memory", K(ret));
      } else {
        size_t size = cs_info->cset->casedn(cs_info, const_cast<char*>(src.ptr()), src.length(), buf, buf_len);
        dst.assign_ptr(buf, static_cast<ObString::obstr_size_t>(size));
      }
    }
  }
  return ret;
}


// True when the two strings compare equal under collation_type; case
// sensitivity is whatever that collation defines.
bool ObCharset::case_insensitive_equal(const ObString &one,
                                       const ObString &another,
                                       const ObCollationType &collation_type) {
  const int cmp = strcmp(collation_type, one, another);
  return 0 == cmp;
}

// True when the two strings compare equal under CS_TYPE_UTF8MB4_BIN.
bool ObCharset::case_sensitive_equal(const ObString &one, const ObString &another)
{
  const int cmp = strcmp(CS_TYPE_UTF8MB4_BIN, one, another);
  return 0 == cmp;
}

// Mode-dependent equality: when the tenant runs in oracle mode the comparison
// is case-sensitive, otherwise (mysql mode) it is case-insensitive.
bool ObCharset::case_compat_mode_equal(const ObString &one, const ObString &another)
{
  bool is_equal = false;
  if (lib::is_oracle_mode()) {
    is_equal = case_sensitive_equal(one, another);
  } else {
    is_equal = case_insensitive_equal(one, another);
  }
  return is_equal;
}
/* Hash intended for database object names (column names, table names, ...).
 * In oracle mode trailing spaces always take part in the hash: although
 * trailing spaces are not allowed in object names, "a" and "a " are two
 * different names in Oracle. Here that flag is taken from lib::is_oracle_mode().
 * An empty string hashes to 0 regardless of the seed.
 * If you want to use this hash function elsewhere, please contact @maoli */
uint64_t ObCharset::hash(const ObCollationType collation_type, const ObString &str,
                         uint64_t seed, hash_algo hash_algo)
{
  return str.empty()
      ? 0
      : ObCharset::hash(collation_type, str.ptr(), str.length(),
                        seed, lib::is_oracle_mode(), hash_algo);
}

/* Hash intended for database object names (column names, table names, ...).
 * In oracle mode trailing spaces always take part in the hash: although
 * trailing spaces are not allowed in object names, "a" and "a " are two
 * different names in Oracle. This overload lets the caller decide that through
 * calc_end_space. An empty string hashes to 0 regardless of the seed.
 * If you want to use this hash function elsewhere, please contact @xiaofeng.lby */
uint64_t ObCharset::hash(
    const ObCollationType collation_type, const ObString &str,
    uint64_t seed, const bool calc_end_space, hash_algo hash_algo)
{
  return str.empty()
      ? 0
      : ObCharset::hash(collation_type, str.ptr(), str.length(),
                        seed, calc_end_space, hash_algo);
}

// Compare two names according to a name-case mode.
// OB_ORIGIN_AND_SENSITIVE compares with CS_TYPE_UTF8MB4_BIN; both insensitive
// modes compare with CS_TYPE_UTF8MB4_GENERAL_CI. An out-of-range mode is
// logged and reported as "not equal".
bool ObCharset::case_mode_equal(const ObNameCaseMode case_mode, const ObString &one,
                                const ObString &another)
{
  const bool mode_in_range = (OB_NAME_CASE_INVALID < case_mode &&
                              case_mode < OB_NAME_CASE_MAX);
  if (OB_UNLIKELY(!mode_in_range)) {
    LOG_WARN_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid cast_mode",
              K(case_mode), K(ret), K(lbt()));
    return false;
  }

  ObCollationType collation_type = CS_TYPE_INVALID;
  switch (case_mode) {
    case OB_ORIGIN_AND_SENSITIVE:
      collation_type = CS_TYPE_UTF8MB4_BIN;
      break;
    case OB_ORIGIN_AND_INSENSITIVE:
    case OB_LOWERCASE_AND_INSENSITIVE:
      collation_type = CS_TYPE_UTF8MB4_GENERAL_CI;
      break;
    default:
      // leave CS_TYPE_INVALID; strcmp() decides what to do with it
      break;
  }
  return 0 == strcmp(collation_type, one, another);
}

bool ObCharset::is_space(const ObCollationType collation_type, char c)
{
  bool ret = false;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    ret = ob_isspace(cs, c);
  }
  return ret;
}

bool ObCharset::is_graph(const ObCollationType collation_type, char c)
{
  bool ret = false;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    ret = ob_isgraph(cs, c);
  }
  return ret;
}

bool ObCharset::usemb(const ObCollationType collation_type)
{
  bool ret = false;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    ret = use_mb(cs);
  }
  return ret;
}

int ObCharset::is_mbchar(const ObCollationType collation_type, const char *str, const char *end)
{
  bool ret = false;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type), K(lbt()));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    ret = ob_ismbchar(cs, str, end);
  }
  return ret;
}

// Looks up the charset registered for collation_type.
// Returns NULL (after logging) for an out-of-range collation; the returned
// entry itself may also be NULL when nothing is registered in that slot.
const ObCharsetInfo *ObCharset::get_charset(const ObCollationType collation_type)
{
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    LOG_WARN_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)", K(collation_type), K(lbt()));
    return NULL;
  }
  return static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
}

int ObCharset::get_mbmaxlen_by_coll(const ObCollationType collation_type, int64_t &mbmaxlen)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    mbmaxlen = cs->mbmaxlen;
  }
  return ret;
}

int ObCharset::get_mbminlen_by_coll(const ObCollationType collation_type, int64_t &mbminlen)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    mbminlen = cs->mbminlen;
  }
  return ret;
}

/*in order to prevent a char from be splitted into 2 blocks
We have to get the right bound of a string in terms a block
Take "我爱你" as an example
if len_limit_in_byte = 8 which means that the max size of a block is 8 Bytes
since '我' and '爱' takes 6 Bytes in total already.
and '你' takes 3 Bytes.
if we assign the '你' to the block
then the total length will be 9 which is greater than 8
so , byte_num = 6  and char_num = 2 will be returned.
and '你' has to be assigned to another block.

Please note that:

byte_num and char_num should not be used if the status returned by this func is not ob_success!

*/

int ObCharset::fit_string(const ObCollationType collation_type,
                          const char *str,
                          const int64_t str_len,
                          const int64_t len_limit_in_byte,
                          int64_t &byte_num,
                          int64_t &char_num)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
                  collation_type >= CS_TYPE_MAX) ||
                  len_limit_in_byte <= 0 ||
                  str_len <= 0 ||
                  OB_ISNULL(str)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected error. invalid argument(s)",
        K(collation_type), KP(str), K(str_len), K(len_limit_in_byte));
  } else if (OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("unsupported charset or collation", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    byte_num = 0;
    char_num = 0;
    int64_t max_len =  std::min(str_len, len_limit_in_byte);
    const char *buf_start = str;
    const char *buf_end = str + str_len;
    int64_t char_len = 0;
    int error = 0;
    while(buf_start < buf_end) {
      char_len = static_cast<int64_t>(cs->cset->well_formed_len(cs, buf_start, buf_end, 1, &error));
      if (OB_UNLIKELY(0 != error || char_len <= 0)) {
        ret = OB_INVALID_ARGUMENT;
        break;
      } else if (OB_UNLIKELY(byte_num > max_len - char_len)) {
        break;
      } else {
        byte_num += char_len;
        buf_start += char_len;
        ++char_num;
      }
    }
  }
  return ret;
}

// Sanity-checks the (charset, string, length) triple used by the charset
// routines. Invalid combinations are:
//   - a NULL string together with a non-zero length,
//   - a negative length,
//   - a NULL charset or a charset without a cset handler.
// A NULL string with length 0 (an empty string) is accepted on purpose; the
// charset handlers that get called afterwards are expected to cope with it.
// Invalid arguments are reported through BACKTRACE_RET.
inline bool ObCharset::is_argument_valid(const ObCharsetInfo *cs, const char *str, int64_t str_len)
{
  const bool bad_string = (OB_ISNULL(str) && OB_UNLIKELY(0 != str_len)) ||
                          OB_UNLIKELY(str_len < 0);
  const bool bad_charset = OB_ISNULL(cs) || OB_ISNULL(cs->cset);
  if (!bad_string && !bad_charset) {
    return true;
  }
  const ObFatalErrExtraInfoGuard *extra_info = ObFatalErrExtraInfoGuard::get_thd_local_val_ptr();
  BACKTRACE_RET(WARN, OB_INVALID_ARGUMENT, true, "invalid argument. charset info = %p, str = %p, str_len = %ld, extra_info=(%s), lbt=(%s)", cs, str, str_len, (NULL == extra_info) ? NULL : to_cstring(*extra_info), lbt());
  return false;
}
inline bool ObCharset::is_argument_valid(const ObCollationType collation_type, const char *str1, int64_t str_len1, const char *str2, int64_t str_len2)
{
  bool is_arg_valid = true;
  if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX) ||
      OB_ISNULL(ObCharset::charset_arr[collation_type]) ||
      OB_UNLIKELY(str_len1 < 0) ||
      OB_UNLIKELY(str_len2 < 0) ||
      (OB_ISNULL(str1) && OB_UNLIKELY(0 != str_len1)) ||
      (OB_ISNULL(str2) && OB_UNLIKELY(0 != str_len2))) {
    is_arg_valid = false;
    const ObFatalErrExtraInfoGuard *extra_info = ObFatalErrExtraInfoGuard::get_thd_local_val_ptr();
    BACKTRACE_RET(WARN, OB_INVALID_ARGUMENT, true, "invalid argument."
        "collation_type = %d,"
        "str1 = %p,"
        "str1_len = %ld,"
        "str2 = %p,"
        "str2_len = %ld,"
        "extra_info=(%s),"
        "lbt=(%s)", collation_type, str1, str_len1, str2, str_len2,
        (NULL == extra_info) ? NULL : to_cstring(*extra_info), lbt());
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset) || OB_ISNULL(cs->coll)) {
      is_arg_valid = false;
      BACKTRACE_RET(WARN, OB_INVALID_ARGUMENT, true, "invalid argument."
          "collation_type = %d,"
          "str1 = %p,"
          "str1_len = %ld,"
          "str2 = %p,"
          "str2_len = %ld,"
          "charset handler = %p,"
          "collation handler = %p,"
          "lbt=(%s)", collation_type, str1, str_len1, str2, str_len2, cs->cset, cs->coll, lbt());
    }
  }
  return is_arg_valid;
}

// Tells whether lengths for the charset behind collation_type are counted in
// bytes (binary) or not (all listed text charsets).
// @param[out] len_in_byte  true only for CHARSET_BINARY
// @return OB_ERR_UNEXPECTED for any charset not listed below.
int ObCharset::get_aggregate_len_unit(const ObCollationType collation_type, bool &len_in_byte)
{
  int ret = OB_SUCCESS;
  ObCharsetType res_charset = ObCharset::charset_type_by_coll(collation_type);
  switch (res_charset) {
    case CHARSET_UTF8MB4:
    case CHARSET_LATIN1:
    case CHARSET_UTF16:
    case CHARSET_GBK:
    case CHARSET_GB18030:
    case CHARSET_GB18030_2022:
      len_in_byte = false;
      break;
    case CHARSET_BINARY:
      len_in_byte = true;
      break;
    default:
      len_in_byte = false;
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected charset", K(ret), K(res_charset), K(collation_type));
      break;
  }
  return ret;
}

//进行字符集之间的转换，from_type为源字符集，to_type为目标字符集
int ObCharset::charset_convert(const ObCollationType from_type,
                               const char *from_str,
                               const uint32_t from_len,
                               const ObCollationType to_type,
                               char *to_str,
                               int64_t to_len,
                               uint32_t &result_len,
                               bool trim_incomplete_tail,
                               bool report_error /*true*/,
                               const ob_wc_t replaced_char /*'?'*/) {
  int ret = OB_SUCCESS;
  if (NULL == from_str || from_len <=0) {
    result_len = 0;
  } else if (OB_UNLIKELY(from_type <= CS_TYPE_INVALID
                  || from_type >= CS_TYPE_MAX
                  || to_type <= CS_TYPE_INVALID
                  || to_type >= CS_TYPE_MAX
                  || (OB_ISNULL(to_str)
                  || OB_UNLIKELY(to_len <= 0 || to_len > UINT32_MAX)))) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid convert", K(ret), K(from_type), K(to_type),
             K(ObString(from_len, from_str)), KP(to_str), K(from_len), K(to_len), KCSTRING(lbt()));
  } else {
    ObCharsetInfo *from_cs = static_cast<ObCharsetInfo*>(ObCharset::charset_arr[from_type]);
    ObCharsetInfo *to_cs = static_cast<ObCharsetInfo*>(ObCharset::charset_arr[to_type]);
    ObCharsetType src_cs = ObCharset::charset_type_by_coll(from_type);
    ObCharsetType dst_cs = ObCharset::charset_type_by_coll(to_type);
    if ((src_cs == CHARSET_GB18030 && dst_cs == CHARSET_GB18030_2022) ||
        (src_cs == CHARSET_GB18030_2022 && dst_cs == CHARSET_GB18030)) {
      /** GB18030 and GB18030_2022 have the same code points,
        *  but they have different mapping to unicode.
        *  So, we do charset_convert from the charset to the same charset*/
      to_cs = from_cs;
    }
    if (OB_ISNULL(from_cs) || OB_ISNULL(to_cs)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected collation type", K(ret), K(from_type), K(to_type));
    } else {
      unsigned int errors = 0;
      result_len = ob_convert(to_str, static_cast<uint32_t>(to_len), to_cs, from_str, from_len, from_cs,
                              trim_incomplete_tail, replaced_char, &errors);
      if (OB_UNLIKELY(errors != 0 && report_error)) {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
        LOG_WARN("ob_convert failed", K(ret), K(errors),
                K(from_type), K(to_type),
                "from_charset", from_cs->csname, "to_charset", to_cs->csname,
                K(ObString(from_len, from_str)),
                K(to_len), KPHEX(from_str, from_len));
      }
    }
  }

  return ret;
}

// Allocator-based charset conversion of `in` from src_cs_type to dst_cs_type.
//
// Three paths, chosen in this order:
//  1. No real conversion needed (empty input, same charset on both sides, or
//     the destination is binary) and REPLACE_UNKNOWN_CHARACTER_ON_SAME_CHARSET
//     is not requested: `out` aliases `in`, or is a copy allocated from
//     `alloc` when COPY_STRING_ON_SAME_CHARSET is set.
//  2. The source is binary: the bytes are copied and left-padded with zero
//     bytes so the length becomes a multiple of the destination's mbminlen.
//  3. Otherwise a buffer of in.length() * mbmaxlen(dst) bytes is allocated and
//     the raw-buffer charset_convert() is called. If that fails and one of the
//     REPLACE_UNKNOWN_CHARACTER* flags is set, the input is converted one
//     character at a time and each character that cannot be converted is
//     replaced by '?' (encoded in the destination charset).
//
// @param action_flag  optional; when replacement mode is entered the
//                     corresponding REPLACE_* bit is OR-ed into it.
// @return OB_INVALID_ARGUMENT for invalid collations,
//         OB_ALLOCATE_MEMORY_FAILED when `alloc` returns NULL,
//         OB_SIZE_OVERFLOW when the replacement loop runs out of buffer,
//         or the error of the underlying conversion when no replacement flag
//         is set.
int ObCharset::charset_convert(ObIAllocator &alloc,
                               const ObString &in,
                               const ObCollationType src_cs_type,
                               const ObCollationType dst_cs_type,
                               ObString &out,
                               int64_t convert_flag,
                               int64_t *action_flag)
{
  int ret = OB_SUCCESS;
  if (!is_valid_collation(src_cs_type) || !is_valid_collation(dst_cs_type)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid collation type", K(ret), K(src_cs_type), K(dst_cs_type));
  } else  {
    if ((0 == in.length()
         || charset_type_by_coll(src_cs_type) == charset_type_by_coll(dst_cs_type)
         || charset_type_by_coll(dst_cs_type) == CHARSET_BINARY)
        && !(convert_flag & REPLACE_UNKNOWN_CHARACTER_ON_SAME_CHARSET)) {
      // Path 1: nothing to convert.
      if (!(convert_flag & COPY_STRING_ON_SAME_CHARSET)) {
        // shallow: out points at the caller's input buffer
        out = in;
      } else {
        if (OB_FAIL(ob_write_string(alloc, in, out))) {
          LOG_WARN("fail to write string", K(ret), K(in));
        }
      }
    } else if (charset_type_by_coll(src_cs_type) == CHARSET_BINARY) {
      // Path 2: binary source; pad at the front up to a multiple of mbminlen.
      char *buf = nullptr;
      int32_t align_offset = 0;
      int32_t res_buf_len = 0;
      // NOTE(review): get_charset() can return NULL; this relies on
      // is_valid_collation() above implying a registered charset -- confirm.
      int mbminlen = ObCharset::get_charset(dst_cs_type)->mbminlen;
      if (mbminlen > 0 && in.length() % mbminlen != 0) {
        align_offset = mbminlen - in.length() % mbminlen;
      }
      res_buf_len = in.length() + align_offset;
      if (OB_ISNULL(buf = static_cast<char*>(alloc.alloc(res_buf_len)))) {
        ret = OB_ALLOCATE_MEMORY_FAILED;
        out.reset();
        LOG_WARN("allocate memory failed", K(ret), K(in), K(align_offset));
      } else {
        // payload goes after the padding; the padding bytes are zeroed
        MEMCPY(buf + align_offset, in.ptr(), in.length());
        MEMSET(buf, 0, align_offset);
        out.assign_ptr(buf, res_buf_len);
      }
    } else {
      // Path 3: real conversion.
      int64_t maxmb_len = 0;
      if (OB_FAIL(ObCharset::get_mbmaxlen_by_coll(dst_cs_type, maxmb_len))) {
        LOG_WARN("failed to get mbmaxlen by coll", K(dst_cs_type));
      } else {
        // NOTE(review): the 64-bit product is narrowed to uint32_t here;
        // presumably input lengths are small enough in practice -- confirm.
        const uint32_t res_buf_len = in.length() * maxmb_len;
        uint32_t res_len = 0;
        char *res_buf = static_cast<char *>(alloc.alloc(res_buf_len));
        if (OB_ISNULL(res_buf)) {
          ret = OB_ALLOCATE_MEMORY_FAILED;
          LOG_WARN("alloc memory failed", K(ret));
        } else  {
          if (OB_SUCC(charset_convert(src_cs_type, in.ptr(), in.length(),
                                      dst_cs_type, res_buf, res_buf_len, res_len))) {
            out.assign_ptr(res_buf, res_len);
          } else {
            // Whole-string conversion failed. Without a replacement flag the
            // error in ret is returned as is; with one, fall back to
            // per-character conversion with '?' substitution.
            LOG_WARN("convert charset failed",
                    K(ret), K(in), K(src_cs_type), K(dst_cs_type),
                    KPHEX(in.ptr(), in.length()));
            if (!!(convert_flag & REPLACE_UNKNOWN_CHARACTER)
                || !!(convert_flag & REPLACE_UNKNOWN_CHARACTER_ON_SAME_CHARSET)) {
              if (OB_NOT_NULL(action_flag)) {
                // REPLACE_UNKNOWN_CHARACTER wins when both flags are set
                if (!!(convert_flag & REPLACE_UNKNOWN_CHARACTER)) {
                  *action_flag |= REPLACE_UNKNOWN_CHARACTER;
                } else {
                  *action_flag |= REPLACE_UNKNOWN_CHARACTER_ON_SAME_CHARSET;
                }
              }
              int32_t in_offset = 0;
              int64_t res_buf_offset = 0;
              ObString question_mark = ObCharsetUtils::get_const_str(dst_cs_type, '?');
              // Loop while input remains and there is still room for at least
              // one replacement character in the output buffer.
              while (in_offset < in.length()
                    && res_buf_offset + question_mark.length() <= res_buf_len) {
                // ret is reset on every iteration: a failure only affects the
                // current character.
                ret = OB_SUCCESS;
                // byte length of the next source character
                int64_t offset = ObCharset::charpos(src_cs_type, in.ptr() + in_offset,
                                                    in.length() - in_offset, 1, &ret);
                if (OB_SUCC(ret)) {
                  ret = ObCharset::charset_convert(src_cs_type, in.ptr() + in_offset, offset,
                      dst_cs_type, res_buf + res_buf_offset, res_buf_len - res_buf_offset, res_len);
                }
                in_offset += offset;
                if (OB_SUCCESS == ret) {
                  res_buf_offset += res_len;
                } else {
                  // unconvertible character: emit '?' instead
                  MEMCPY(res_buf + res_buf_offset, question_mark.ptr(), question_mark.length());
                  res_buf_offset += question_mark.length();
                }
              }
              if (in_offset < in.length()) {
                // output buffer was exhausted before all input was consumed
                ret = OB_SIZE_OVERFLOW;
                LOG_WARN("buf size over flow", K(ret), K(in), KPHEX(in.ptr(), in.length()));
              } else {
                res_len = res_buf_offset;
                out.assign_ptr(res_buf, res_len);
                // replacements are not errors for the caller
                ret = OB_SUCCESS;
              }
            }
          }
        }
      }
    }
  }
  return ret;
}

// Build a copy of `input` followed by pad_whitespace_length space characters.
// A space is one OB_PADDING_CHAR byte, except for UTF16 where it is the two
// bytes 0x00 OB_PADDING_CHAR.
//
// @param allocator              provides the result buffer
// @param coll_type              collation of `input`; only used to detect UTF16
// @param pad_whitespace_length  number of space characters to append (> 0)
// @param[out] result            set only on OB_SUCCESS; points into memory
//                               obtained from `allocator`
// @return OB_INVALID_ARGUMENT when pad_whitespace_length <= 0,
//         OB_SIZE_OVERFLOW when the padded length does not fit in the 32-bit
//         buffer length used below,
//         OB_ALLOCATE_MEMORY_FAILED when the allocator returns NULL.
int ObCharset::whitespace_padding(ObIAllocator &allocator,
                                  const ObCollationType coll_type,
                                  const ObString &input,
                                  const int64_t pad_whitespace_length,
                                  ObString &result)
{
  int ret = OB_SUCCESS;
  char *buf = NULL;
  const bool is_utf16 = charset_type_by_coll(coll_type) == CHARSET_UTF16;
  const int64_t bytes_per_space = is_utf16 ? 2 : 1;
  const int64_t input_len = input.length();
  if (OB_UNLIKELY(pad_whitespace_length <= 0)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid len", K(ret), K(pad_whitespace_length));
  } else if (OB_UNLIKELY(pad_whitespace_length > (INT32_MAX - input_len) / bytes_per_space)) {
    // The total is checked in 64 bits before it is narrowed: a truncated
    // buffer size combined with the untruncated pad length would make the
    // fill below write past the allocation.
    ret = OB_SIZE_OVERFLOW;
    LOG_WARN("padded length overflow", K(ret), K(input_len), K(pad_whitespace_length));
  } else {
    const int32_t buf_len = static_cast<int32_t>(input_len + pad_whitespace_length * bytes_per_space);
    if (OB_ISNULL(buf = static_cast<char*>(allocator.alloc(buf_len)))) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("no memory", K(ret), K(buf_len));
    } else {
      if (input_len > 0) {
        // skip the copy for an empty input, whose pointer may be NULL
        MEMMOVE(buf, input.ptr(), input_len);
      }
      if (!is_utf16) {
        MEMSET(buf + input_len, OB_PADDING_CHAR, pad_whitespace_length);
      } else {
        //UTF16 space is 0x0020
        for (int32_t i = static_cast<int32_t>(input_len); i + 1 < buf_len; i += 2) {
          buf[i] = '\0';
          buf[i + 1] = OB_PADDING_CHAR;
        }
        LOG_DEBUG("UTF16 padding", K(pad_whitespace_length), K(input));
      }
      result = ObString(buf_len, buf_len, buf);
    }
  }
  return ret;
}

// True when the charset registered for collation_type has the OB_CS_NONASCII
// state bit set. Returns false (after logging) when the collation is out of
// range or has no registered charset.
bool ObCharset::is_cs_nonascii(ObCollationType collation_type)
{
  const bool type_in_range = (CS_TYPE_INVALID < collation_type &&
                              collation_type < CS_TYPE_MAX);
  // charset_arr is only indexed once the range check has passed
  if (type_in_range && OB_NOT_NULL(ObCharset::charset_arr[collation_type])) {
    const ObCharsetInfo *charset = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    return 0 != (charset->state & OB_CS_NONASCII);
  }
  LOG_WARN_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)", K(ret), K(collation_type), K(lbt()));
  return false;
}

// True when the charset registered for collation_type has the OB_CS_UNICODE
// state bit set. Returns false (after logging) when the collation is out of
// range or has no registered charset.
bool ObCharset::is_cs_unicode(ObCollationType collation_type)
{
  const bool type_in_range = (CS_TYPE_INVALID < collation_type &&
                              collation_type < CS_TYPE_MAX);
  // charset_arr is only indexed once the range check has passed
  if (type_in_range && OB_NOT_NULL(ObCharset::charset_arr[collation_type])) {
    const ObCharsetInfo *charset = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    return 0 != (charset->state & OB_CS_UNICODE);
  }
  LOG_WARN_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)", K(ret), K(collation_type), K(lbt()));
  return false;
}

// True when the charset behind collation_type is GBK, GB18030 or GB18030_2022.
bool ObCharset::is_cjk_charset(ObCollationType collation_type)
{
  switch (ObCharset::charset_type_by_coll(collation_type)) {
    case CHARSET_GBK:
    case CHARSET_GB18030:
    case CHARSET_GB18030_2022:
      return true;
    default:
      return false;
  }
}

// True when the charset behind collation_type is one of the charsets accepted
// here as a connection collation: UTF8MB4, LATIN1, GBK, GB18030,
// GB18030_2022 or BINARY.
bool ObCharset::is_valid_connection_collation(ObCollationType collation_type)
{
  bool is_accepted = false;
  switch (ObCharset::charset_type_by_coll(collation_type)) {
    case CHARSET_UTF8MB4:
    case CHARSET_LATIN1:
    case CHARSET_GBK:
    case CHARSET_GB18030:
    case CHARSET_GB18030_2022:
    case CHARSET_BINARY:
      is_accepted = true;
      break;
    default:
      break;
  }
  return is_accepted;
}

// Maps a charset type to the Oracle-style charset name used for it.
// Returns NULL for charset types without a mapping.
const char *ObCharset::get_oracle_charset_name_by_charset_type(ObCharsetType charset_type)
{
  switch (charset_type) {
  case CHARSET_UTF8MB4:
    return "AL32UTF8";
  case CHARSET_GBK:
    return "ZHS16GBK";
  case CHARSET_UTF16:
    return "AL16UTF16";
  case CHARSET_GB18030:
    return "ZHS32GB18030";
  case CHARSET_GB18030_2022:
    return "ZHS32GB18030_2022";
  case CHARSET_LATIN1:
    return "WE8MSWIN1252";
  default:
    return NULL;
  }
}

// Maps a charset type to its ObNlsCharsetId, returned as int.
// Charset types without a mapping yield ObNlsCharsetId::CHARSET_INVALID_ID.
int ObCharset::get_nls_charset_id_by_charset_type(ObCharsetType charset_type)
{
  ObNlsCharsetId nls_id = ObNlsCharsetId::CHARSET_INVALID_ID;
  if (CHARSET_UTF8MB4 == charset_type) {
    nls_id = ObNlsCharsetId::CHARSET_AL32UTF8_ID;
  } else if (CHARSET_GBK == charset_type) {
    nls_id = ObNlsCharsetId::CHARSET_ZHS16GBK_ID;
  } else if (CHARSET_UTF16 == charset_type) {
    nls_id = ObNlsCharsetId::CHARSET_AL16UTF16_ID;
  } else if (CHARSET_GB18030 == charset_type) {
    nls_id = ObNlsCharsetId::CHARSET_ZHS32GB18030_ID;
  } else if (CHARSET_LATIN1 == charset_type) {
    nls_id = ObNlsCharsetId::CHARSET_WE8MSWIN1252_ID;
  } else if (CHARSET_GB18030_2022 == charset_type) {
    nls_id = ObNlsCharsetId::CHARSET_ZHS32GB18030_2022_ID;
  }
  return static_cast<int>(nls_id);
}


// Reporter callback installed into ObCharsetLoader: forwards the error code to
// the library log at the severity matching `level`. Unknown levels are
// ignored, and the variadic arguments are not consumed.
static void ob_charset_error_reporter(enum loglevel level, unsigned int ecode, ...) {
  UNUSED(ecode);
  if (ERROR_LEVEL == level) {
    LIB_LOG_RET(ERROR, OB_ERROR, "fail to init charset", K(ecode));
  } else if (WARNING_LEVEL == level) {
    LIB_LOG_RET(WARN, OB_ERROR, "fail to init charset", K(ecode));
  } else if (INFORMATION_LEVEL == level) {
    LIB_LOG(INFO, "fail to init charset", K(ecode));
  }
}

#define CHARSET_INIT_MEM_ATTR "CharsetInit"

// Allocation hook for the charset loader: ob_malloc() labelled with
// CHARSET_INIT_MEM_ATTR.
static void *charset_malloc(size_t size) {
  void *block = ob_malloc(size, CHARSET_INIT_MEM_ATTR);
  return block;
}

static void *charset_realloc(void *ptr, size_t size) {
  ObMemAttr attr;
  attr.label_ = CHARSET_INIT_MEM_ATTR;
  return ob_realloc(ptr, size, attr);
}

// Release hook for the charset loader: hands the pointer to ob_free().
static void charset_free(void *ptr) {
  ob_free(ptr);
}

/**
  Initialize a character set loader: clear its error state, route all memory
  management through the charset_* wrappers of this file, and report errors
  through ob_charset_error_reporter. No add_collation callback is installed.
  @param loader  Loader to initialize (must not be NULL)
*/
void ob_charset_loader_init_mysys(ObCharsetLoader *loader)
{
  ObCharsetLoader &target = *loader;

  // error state
  target.errcode = 0;
  target.errarg[0] = '\0';

  // memory management hooks
  target.once_alloc = charset_malloc;
  target.mem_malloc = charset_malloc;
  target.mem_realloc = charset_realloc;
  target.mem_free = charset_free;

  // callbacks
  target.reporter = ob_charset_error_reporter;
  target.add_collation = NULL;
}

// Build the ObCharsetInfo for a zh collation of a non-UTF8MB4 charset.
// The new object starts as a copy of the charset registered for the Oracle
// default collation of to_coll_type's charset, and then takes the UCA data,
// tailoring, collation parameters and collation handler from from_cs.
//
// All arguments are validated before any memory is allocated, so a failing
// call never hands back (or leaks) a half-initialised object.
//
// @param from_cs       charset providing the collation behaviour
// @param to_coll_type  collation the new object is meant for
// @param[out] to_cs    newly allocated charset on OB_SUCCESS, NULL otherwise
// @return OB_INVALID_ARGUMENT when from_cs is NULL,
//         OB_ERR_UNEXPECTED when either collation is invalid or the base
//         charset is not registered,
//         OB_ALLOCATE_MEMORY_FAILED when the allocation fails.
int ObCharset::copy_zh_cs(ObCharsetInfo *from_cs, ObCollationType to_coll_type, ObCharsetInfo *&to_cs)
{
  int ret = OB_SUCCESS;
  to_cs = NULL;
  if (OB_ISNULL(from_cs)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("source charset is null", K(ret), K(to_coll_type));
  } else {
    ObCollationType bin_coll = get_default_collation_oracle(charset_type_by_coll(to_coll_type));
    if (!is_valid_collation(to_coll_type) || !is_valid_collation(bin_coll)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected bin coll", K(ret), K(to_coll_type), K(bin_coll));
    } else if (OB_ISNULL(charset_arr[bin_coll])) {
      // the base charset is copied below, so it has to be registered already
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("bin coll is not registered", K(ret), K(to_coll_type), K(bin_coll));
    } else if (OB_ISNULL(to_cs = static_cast<ObCharsetInfo*>(charset_malloc(sizeof(ObCharsetInfo))))) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("fail to alloc charset", K(ret));
    } else {
      *to_cs = *charset_arr[bin_coll];
      to_cs->uca = from_cs->uca;
      to_cs->tailoring = from_cs->tailoring;
      to_cs->coll_param = from_cs->coll_param;
      to_cs->levels_for_compare = 3;
      to_cs->coll = from_cs->coll;
      to_cs->pad_attribute = NO_PAD;
      //TODO
      //for now, the collations are used for nlssort and not exposed to user
      //the cs attributes are not all correct, such as names and number
    }
  }
  return ret;
}

// Runtime initialisation of the charset table.
//
// Steps, each skipped once `ret` holds an error:
//  1. init_gb18030_2022().
//  2. For each of the three zh_0900_as_cs collation families (referred to in
//     this code as pinyin = zh, radical = zh2, stroke = zh3): run the
//     collation handler's init() on the UTF8MB4 charset object, register that
//     object, then create and register a copy (copy_zh_cs) for every listed
//     collation that has no entry in charset_arr yet.
//  3. Register ob_charset_utf8mb4_0900_bin.
//
// @return OB_SUCCESS, or the first error encountered.
int ObCharset::init_charset()
{
  int ret = OB_SUCCESS;
  if (OB_FAIL(init_gb18030_2022())) {
    LOG_WARN("failed to init gb18030 2022", K(ret));
  }

  // Registers `cs` under `coll_type` and marks it OB_CS_COMPILED.
  // Does nothing when an earlier step has already failed; sets `ret` itself
  // when the arguments are invalid.
  auto add_coll = [&ret](ObCollationType coll_type, ObCharsetInfo *cs)->void {
    if (OB_SUCC(ret)) {
      if (OB_ISNULL(cs) || !is_valid_collation(coll_type)) {
        ret = OB_INVALID_ARGUMENT;
        LOG_WARN("invalid argument", K(ret), K(cs), K(coll_type));
      } else {
        charset_arr[coll_type] = cs;
        cs->state |= OB_CS_COMPILED;
      }
    }
  };


  ObCharsetLoader loader;
  ob_charset_loader_init_mysys(&loader);

  // zh_0900_as_cs family ("pinyin")
  if (OB_SUCC(ret)) {
    auto *utf8_pinyin = &ob_charset_utf8mb4_zh_0900_as_cs;
    ObCollationHandler *pinyin_coll = ob_charset_utf8mb4_zh_0900_as_cs.coll;

    // a non-zero/true result from init() is treated as failure
    if (pinyin_coll->init(utf8_pinyin, &loader)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("fail to init charset", K(ret));
    } else {
      ObCollationType pinyin_colls[] = {
        CS_TYPE_GBK_ZH_0900_AS_CS, CS_TYPE_UTF8MB4_ZH_0900_AS_CS,
        CS_TYPE_GB18030_ZH_0900_AS_CS, CS_TYPE_UTF16_ZH_0900_AS_CS,
        CS_TYPE_GB18030_2022_ZH_0900_AS_CS
      };
      add_coll(CS_TYPE_UTF8MB4_ZH_0900_AS_CS, utf8_pinyin);

      // only collations that still have no charset_arr entry get a copy
      for (int i = 0; OB_SUCC(ret) && i < array_elements(pinyin_colls); i++) {
        if (NULL == charset_arr[pinyin_colls[i]]) {
          ObCharsetInfo *new_cs = NULL;
          if (OB_FAIL(copy_zh_cs(utf8_pinyin, pinyin_colls[i], new_cs))) {
            LOG_WARN("fail to copy zh cs", K(ret));
          } else {
            add_coll(pinyin_colls[i], new_cs);
          }
        }
      }
    }
  }

  // zh2_0900_as_cs family ("radical"); same procedure as above
  if (OB_SUCC(ret)) {
    auto *utf8_radical = &ob_charset_utf8mb4_zh2_0900_as_cs;
    ObCollationHandler *radical_coll = ob_charset_utf8mb4_zh2_0900_as_cs.coll;
    if (radical_coll->init(utf8_radical, &loader)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("fail to init charset", K(ret));
    } else {
      ObCollationType radical_colls[] = {
        CS_TYPE_GBK_ZH2_0900_AS_CS, CS_TYPE_UTF8MB4_ZH2_0900_AS_CS,
        CS_TYPE_GB18030_ZH2_0900_AS_CS, CS_TYPE_UTF16_ZH2_0900_AS_CS,
        CS_TYPE_GB18030_2022_ZH2_0900_AS_CS
      };
      add_coll(CS_TYPE_UTF8MB4_ZH2_0900_AS_CS, utf8_radical);

      for (int i = 0; OB_SUCC(ret) && i < array_elements(radical_colls); i++) {
        if (NULL == charset_arr[radical_colls[i]]) {
          ObCharsetInfo *new_cs = NULL;
          if (OB_FAIL(copy_zh_cs(utf8_radical, radical_colls[i], new_cs))) {
            LOG_WARN("fail to copy zh cs", K(ret));
          } else {
            add_coll(radical_colls[i], new_cs);
          }
        }
      }
    }
  }

  // zh3_0900_as_cs family ("stroke"); same procedure as above
  if (OB_SUCC(ret)) {
    auto *utf8_stroke = &ob_charset_utf8mb4_zh3_0900_as_cs;
    ObCollationHandler *stroke_coll = ob_charset_utf8mb4_zh3_0900_as_cs.coll;
    if (stroke_coll->init(utf8_stroke, &loader)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("fail to init charset", K(ret));
    } else {
      ObCollationType stroke_colls[] = {
        CS_TYPE_GBK_ZH3_0900_AS_CS, CS_TYPE_UTF8MB4_ZH3_0900_AS_CS,
        CS_TYPE_GB18030_ZH3_0900_AS_CS, CS_TYPE_UTF16_ZH3_0900_AS_CS,
        CS_TYPE_GB18030_2022_ZH3_0900_AS_CS
      };
      add_coll(CS_TYPE_UTF8MB4_ZH3_0900_AS_CS, utf8_stroke);

      for (int i = 0; OB_SUCC(ret) && i < array_elements(stroke_colls); i++) {
        if (NULL == charset_arr[stroke_colls[i]]) {
          ObCharsetInfo *new_cs = NULL;
          if (OB_FAIL(copy_zh_cs(utf8_stroke, stroke_colls[i], new_cs))) {
            LOG_WARN("fail to copy zh cs", K(ret));
          } else {
            add_coll(stroke_colls[i], new_cs);
          }
        }
      }
    }
  }

  // Register utf8mb4_0900_bin. add_coll checks `ret` itself, so this is a
  // no-op when an earlier step failed.
  add_coll(CS_TYPE_UTF8MB4_0900_BIN, &ob_charset_utf8mb4_0900_bin);

  return ret;
}


// Table indexed by [charset type][code point 0..INT8_MAX]; ObCharsetUtils::init()
// fills it with the encoded form of each of those code points per valid charset.
ObString ObCharsetUtils::const_str_for_ascii_[CHARSET_MAX][INT8_MAX + 1];

// Shrinks `str` in place so that it ends where skip_trailing_space() says the
// trailing spaces begin. The third argument of skip_trailing_space() is 1 for
// UTF16 and 0 for every other charset.
// @return OB_ERR_UNEXPECTED when the computed end lies before the start of
//         the string, OB_SUCCESS otherwise.
int ObCharsetUtils::remove_char_endspace(ObString &str,
                                         const ObCharsetType &charset_type) {
  int ret = OB_SUCCESS;
  const char *new_end = NULL;
  if (CHARSET_UTF16 == charset_type) {
    new_end = (const char *) skip_trailing_space((const unsigned char *)str.ptr(), str.length(), 1);
  } else {
    new_end = (const char *) skip_trailing_space((const unsigned char *)str.ptr(), str.length(), 0);
  }
  if (new_end < str.ptr()) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("str len < 0", K(ret));
  } else {
    str.assign_ptr(str.ptr(), new_end - str.ptr());
  }
  return ret;
}

// Fills const_str_for_ascii_: for every valid charset, each code point in
// 0..INT8_MAX is encoded with ObCharset::wc_mb() using the charset's default
// collation, and the bytes are stored in memory taken from `allocator`.
// @return OB_ERR_UNEXPECTED when a charset has no valid default collation,
//         OB_ALLOCATE_MEMORY_FAILED on allocation failure, or the wc_mb() error.
int ObCharsetUtils::init(ObIAllocator &allocator)
{
  int ret = OB_SUCCESS;
  const int64_t buf_len = 32;
  char buf[buf_len] = {0};
  const lib::ObMemAttr attr(common::OB_SYS_TENANT_ID, "CharsetUtil");

  for (int charset_idx = CHARSET_INVALID; charset_idx < CHARSET_MAX; ++charset_idx) {
    const ObCharsetType charset_type = static_cast<ObCharsetType>(charset_idx);
    if (!ObCharset::is_valid_charset(charset_type)) {
      continue;
    }
    const ObCollationType coll_type = ObCharset::get_default_collation(charset_type);
    if (!ObCharset::is_valid_collation(coll_type)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("invalid collation type", K(ret), K(charset_type), K(coll_type));
    }
    // the loop condition skips this charset entirely once ret holds an error
    for (int code_point = 0; OB_SUCC(ret) && code_point <= INT8_MAX; ++code_point) {
      int result_len = 0;
      char *sys_buf = nullptr;

      if (OB_FAIL(ObCharset::wc_mb(coll_type, code_point, buf, buf_len, result_len))) {
        LOG_WARN("fail to convert ascii to multi byte char", K(ret), K(buf_len));
      } else if (OB_ISNULL(sys_buf = static_cast<char*>(allocator.alloc(result_len, attr)))) {
        ret = OB_ALLOCATE_MEMORY_FAILED;
        LOG_WARN("fail to allocate mem", K(ret), K(result_len));
      } else {
        MEMCPY(sys_buf, buf, result_len);
        const_str_for_ascii_[charset_type][code_point].assign_ptr(sys_buf, result_len);
      }
    }
  }
  return ret;
}

// Loop-friendly wrapper around the two-argument next_character().
// @param[out] ret  OB_SUCCESS both when a character was produced and when the
//                  end of the string was reached; the error code otherwise.
// @return true when a character was produced, false at the end of the string
//         or on error.
bool ObStringScanner::next_character(ObString &encoding_value, int32_t &unicode_value, int &ret)
{
  ret = next_character(encoding_value, unicode_value);

  if (OB_ITER_END == ret) {
    // reaching the end is not an error for the caller
    ret = OB_SUCCESS;
    return false;
  }
  if (OB_SUCC(ret)) {
    return true;
  }
  LOG_WARN("fail to get next character", K(ret), K(*this));
  return false;
}

// Decodes the character at the front of the remaining string and advances
// past it.
// @param[out] encoding_value  the bytes of the consumed character
// @param[out] unicode_value   written by ObCharset::mb_wc()
// @return OB_ITER_END when nothing is left; OB_ERR_INCORRECT_STRING_VALUE when
//         decoding fails and IGNORE_INVALID_CHARACTER is not set. With that
//         flag set, a failed decode consumes exactly one byte and succeeds.
int ObStringScanner::next_character(ObString &encoding_value, int32_t &unicode_value)
{
  int ret = OB_SUCCESS;
  int32_t length = 0;

  ObString &str = str_;

  if (str.empty()) {
    return OB_ITER_END;
  }
  if (OB_FAIL(ObCharset::mb_wc(collation_type_, str.ptr(), str.length(), length, unicode_value))) {
    const bool skip_invalid = !!(IGNORE_INVALID_CHARACTER & flags_);
    if (!skip_invalid) {
      ret = OB_ERR_INCORRECT_STRING_VALUE;
      LOG_WARN("fail to call mb_wc", K(ret), KPHEX(str.ptr(), str.length()));
      return ret;
    }
    // step over a single byte and carry on
    ret = OB_SUCCESS;
    length = 1;
  }
  encoding_value.assign_ptr(str.ptr(), length);
  LOG_DEBUG("next_character", K(ret), KPHEX(str.ptr(), str.length()));
  str += length;
  return ret;
}

#undef CHARSET_INIT_MEM_ATTR

} // namespace common
} // namespace oceanbase
